From 9e11bd6d9c1543f5d99082f7836dafa99d3df847 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Tue, 26 Sep 2023 17:38:19 -0700 Subject: [PATCH 001/202] Remove AutoPool support --- src/TesApi.Tests/BatchQuotaVerifierTests.cs | 4 +- src/TesApi.Tests/BatchSchedulerTests.cs | 238 +++++----------- ...eteCompletedBatchJobsHostedServiceTests.cs | 111 -------- ...leteOrphanedBatchJobsHostedServiceTests.cs | 113 -------- ...eteOrphanedBatchPoolsHostedServiceTests.cs | 43 --- src/TesApi.Web/AzureProxy.cs | 148 ++-------- src/TesApi.Web/BatchPoolService.cs | 16 +- src/TesApi.Web/BatchScheduler.BatchPools.cs | 32 +-- src/TesApi.Web/BatchScheduler.cs | 256 +++++------------- .../CachingWithRetriesAzureProxy.cs | 8 +- .../DeleteCompletedBatchJobsHostedService.cs | 132 --------- .../DeleteOrphanedAutoPoolsHostedService.cs | 102 ------- .../DeleteOrphanedBatchJobsHostedService.cs | 120 -------- src/TesApi.Web/IAzureProxy.cs | 20 +- .../Management/BatchQuotaVerifier.cs | 16 +- .../Management/IBatchQuotaVerifier.cs | 3 +- .../Options/BatchSchedulingOptions.cs | 14 +- src/TesApi.Web/Startup.cs | 5 +- .../helm/templates/tes-deployment.yaml | 2 - .../scripts/helm/values-template.yaml | 1 - 20 files changed, 182 insertions(+), 1202 deletions(-) delete mode 100644 src/TesApi.Tests/DeleteCompletedBatchJobsHostedServiceTests.cs delete mode 100644 src/TesApi.Tests/DeleteOrphanedBatchJobsHostedServiceTests.cs delete mode 100644 src/TesApi.Tests/DeleteOrphanedBatchPoolsHostedServiceTests.cs delete mode 100644 src/TesApi.Web/DeleteCompletedBatchJobsHostedService.cs delete mode 100644 src/TesApi.Web/DeleteOrphanedAutoPoolsHostedService.cs delete mode 100644 src/TesApi.Web/DeleteOrphanedBatchJobsHostedService.cs diff --git a/src/TesApi.Tests/BatchQuotaVerifierTests.cs b/src/TesApi.Tests/BatchQuotaVerifierTests.cs index d2bb3f7bb..f629e1f81 100644 --- a/src/TesApi.Tests/BatchQuotaVerifierTests.cs +++ b/src/TesApi.Tests/BatchQuotaVerifierTests.cs @@ -56,7 +56,7 @@ public async Task 
CheckBatchAccountQuotasAsync_ProviderReturnsNull_ThrowsExcepti services.BatchQuotaProvider.Setup(p => p.GetQuotaForRequirementAsync(It.IsAny(), It.IsAny(), It.IsAny(), It.IsAny())) .ReturnsAsync(batchVmFamilyQuotas); - await batchQuotaVerifier.CheckBatchAccountQuotasAsync(vmInfo, true, true, System.Threading.CancellationToken.None); + await batchQuotaVerifier.CheckBatchAccountQuotasAsync(vmInfo, true, System.Threading.CancellationToken.None); logger.Verify(l => l.LogError(It.IsAny(), It.IsAny()), Times.Once); @@ -99,7 +99,7 @@ public static async Task SetupAndCheckBatchAccountQuotasAsync(int requestedNumbe services.AzureProxy.Setup(p => p.GetBatchActivePoolCount()).Returns(activePoolCount); services.BatchSkuInformationProvider.Setup(p => p.GetVmSizesAndPricesAsync(Region, It.IsAny())).ReturnsAsync(CreateBatchSupportedVmSkuList(10)); - await batchQuotaVerifier.CheckBatchAccountQuotasAsync(vmInfo, true, true, System.Threading.CancellationToken.None); + await batchQuotaVerifier.CheckBatchAccountQuotasAsync(vmInfo, true, System.Threading.CancellationToken.None); } private static List CreateBatchSupportedVmSkuList(int maxNumberOfCores) diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index 0f394ef4f..29b32cbf7 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -32,7 +32,6 @@ namespace TesApi.Tests [TestClass] public class BatchSchedulerTests { - [TestCategory("Batch Pools")] [TestMethod] public async Task LocalPoolCacheAccessesNewPoolsAfterAllPoolsRemovedWithSameKey() { @@ -52,7 +51,6 @@ public async Task LocalPoolCacheAccessesNewPoolsAfterAllPoolsRemovedWithSameKey( Assert.AreSame(pool, pool1); } - [TestCategory("Batch Pools")] [TestMethod] public async Task GetOrAddDoesNotAddExistingAvailablePool() { @@ -74,7 +72,6 @@ public async Task GetOrAddDoesNotAddExistingAvailablePool() serviceProvider.AzureProxy.Verify(mock => mock.CreateBatchPoolAsync(It.IsAny(), It.IsAny(), 
It.IsAny()), Times.Once); } - [TestCategory("Batch Pools")] [TestMethod] public async Task GetOrAddDoesAddWithExistingUnavailablePool() { @@ -97,7 +94,6 @@ public async Task GetOrAddDoesAddWithExistingUnavailablePool() } - [TestCategory("Batch Pools")] [TestMethod] public async Task TryGetReturnsTrueAndCorrectPool() { @@ -112,7 +108,6 @@ public async Task TryGetReturnsTrueAndCorrectPool() Assert.AreEqual(info.Pool.PoolId, pool.Pool.PoolId); } - [TestCategory("Batch Pools")] [TestMethod] public async Task TryGetReturnsFalseWhenPoolIdNotPresent() { @@ -125,7 +120,6 @@ public async Task TryGetReturnsFalseWhenPoolIdNotPresent() Assert.IsFalse(result); } - [TestCategory("Batch Pools")] [TestMethod] public async Task TryGetReturnsFalseWhenNoPoolIsAvailable() { @@ -139,7 +133,6 @@ public async Task TryGetReturnsFalseWhenNoPoolIsAvailable() Assert.IsFalse(result); } - [TestCategory("Batch Pools")] [TestMethod] public Task TryGetReturnsFalseWhenPoolIdIsNull() { @@ -152,7 +145,6 @@ public Task TryGetReturnsFalseWhenPoolIdIsNull() return Task.CompletedTask; } - [TestCategory("Batch Pools")] [TestMethod] public async Task UnavailablePoolsAreRemoved() { @@ -207,28 +199,6 @@ public async Task BackendParametersVmSizeShallOverrideVmSelection() Assert.AreEqual(TesState.SYSTEMERROREnum, state); } - [TestCategory("TES 1.1")] - [TestMethod] - public async Task BackendParametersWorkflowExecutionIdentityRequiresManualPool() - { - var azureProxyReturnValues = AzureProxyReturnValues.Defaults; - azureProxyReturnValues.BatchJobAndTaskState = new() { JobState = null }; - - var task = GetTesTask(); - task.Resources.BackendParameters = new() - { - { "workflow_execution_identity", "/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/coa/providers/Microsoft.ManagedIdentity/userAssignedIdentities/coa-test-uami" } - }; - - (_, _, var poolInformation, _) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(task, GetMockConfig(true)(), GetMockAzureProxy(azureProxyReturnValues), 
AzureProxyReturnValues.Defaults); - - GuardAssertsWithTesTask(task, () => - { - Assert.IsNull(poolInformation.AutoPoolSpecification); - Assert.IsFalse(string.IsNullOrWhiteSpace(poolInformation.PoolId)); - }); - } - [TestCategory("TES 1.1")] [DataRow("VmSizeLowPri1", true)] @@ -242,7 +212,7 @@ public async Task TestIfVmSizeIsAvailable(string vmSize, bool preemptible) task.Resources.Preemptible = preemptible; task.Resources.BackendParameters = new() { { "vm_size", vmSize } }; - var config = GetMockConfig(false)(); + var config = GetMockConfig()(); using var serviceProvider = GetServiceProvider( config, GetMockAzureProxy(AzureProxyReturnValues.Defaults), @@ -327,28 +297,11 @@ public async Task TesTaskRemainsQueuedWhenBatchQuotaIsTemporarilyUnavailable() azureProxyReturnValues.BatchQuotas = new() { ActiveJobAndJobScheduleQuota = 1, PoolQuota = 1, DedicatedCoreQuota = 9, LowPriorityCoreQuota = 17 }; - azureProxyReturnValues.ActiveNodeCountByVmSize = new List { - new() { VirtualMachineSize = "VmSize1", DedicatedNodeCount = 4, LowPriorityNodeCount = 8 } // 8 (4 * 2) dedicated and 16 (8 * 2) low pri cores are in use, there is no more room for 2 cores - }; - - // The actual CPU core count (2) of the selected VM is used for quota calculation, not the TesResources CpuCores requirement - Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(new TesResources { CpuCores = 1, RamGb = 1, Preemptible = false }, azureProxyReturnValues)); - Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(new TesResources { CpuCores = 1, RamGb = 1, Preemptible = true }, azureProxyReturnValues)); - azureProxyReturnValues.ActiveNodeCountByVmSize = new List { new() { VirtualMachineSize = "VmSize1", DedicatedNodeCount = 4, LowPriorityNodeCount = 7 } // 8 dedicated and 14 low pri cores are in use }; Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(new TesResources { CpuCores = 1, RamGb = 1, Preemptible = true }, azureProxyReturnValues)); - - var 
dedicatedCoreQuotaPerVMFamily = new List { new("VmFamily1", 9) }; - azureProxyReturnValues.BatchQuotas = new() { ActiveJobAndJobScheduleQuota = 1, PoolQuota = 1, DedicatedCoreQuota = 100, LowPriorityCoreQuota = 17, DedicatedCoreQuotaPerVMFamilyEnforced = true, DedicatedCoreQuotaPerVMFamily = dedicatedCoreQuotaPerVMFamily }; - - azureProxyReturnValues.ActiveNodeCountByVmSize = new List { - new() { VirtualMachineSize = "VmSize1", DedicatedNodeCount = 4, LowPriorityNodeCount = 8 } // 8 (4 * 2) dedicated and 16 (8 * 2) low pri cores are in use, there is no more room for 2 cores - }; - - Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(new TesResources { CpuCores = 1, RamGb = 1, Preemptible = false }, azureProxyReturnValues)); } private async Task AddBatchTaskHandlesExceptions(TesState newState, Func, Action>)> testArranger, Action> resultValidator) @@ -366,7 +319,7 @@ private async Task AddBatchTaskHandlesExceptions(TesState newState, Func @@ -640,12 +593,11 @@ void Validator(TesTask tesTask, IEnumerable<(LogLevel logLevel, Exception except } } - [TestCategory("Batch Pools")] [TestMethod] public async Task BatchJobContainsExpectedBatchPoolInformation() { var tesTask = GetTesTask(); - var config = GetMockConfig(false)(); + var config = GetMockConfig()(); using var serviceProvider = GetServiceProvider( config, GetMockAzureProxy(AzureProxyReturnValues.Defaults), @@ -675,48 +627,12 @@ public async Task BatchJobContainsExpectedBatchPoolInformation() }); } - [TestMethod] - public async Task BatchJobContainsExpectedAutoPoolInformation() - { - (_, _, var poolInformation, _) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(true); - - Assert.IsNull(poolInformation.PoolId); - Assert.IsNotNull(poolInformation.AutoPoolSpecification); - Assert.AreEqual("TES", poolInformation.AutoPoolSpecification.AutoPoolIdPrefix); - Assert.AreEqual("VmSizeDedicated1", poolInformation.AutoPoolSpecification.PoolSpecification.VirtualMachineSize); - Assert.AreEqual(1, 
poolInformation.AutoPoolSpecification.PoolSpecification.TargetDedicatedComputeNodes); - Assert.AreEqual(1, poolInformation.AutoPoolSpecification.PoolSpecification.VirtualMachineConfiguration.ContainerConfiguration.ContainerRegistries.Count); - } - - [TestCategory("TES 1.1")] - [TestMethod] - public async Task BatchJobContainsExpectedManualPoolInformation() - { - var task = GetTesTask(); - task.Resources.BackendParameters = new() - { - { "workflow_execution_identity", "/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/coa/providers/Microsoft.ManagedIdentity/userAssignedIdentities/coa-test-uami" } - }; - - (_, _, var poolInformation, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(task, GetMockConfig(true)(), GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); - - GuardAssertsWithTesTask(task, () => - { - Assert.IsNotNull(poolInformation.PoolId); - Assert.IsNull(poolInformation.AutoPoolSpecification); - Assert.AreEqual("TES_JobId-1", poolInformation.PoolId); - Assert.AreEqual("VmSizeDedicated1", pool.VmSize); - Assert.AreEqual(1, pool.ScaleSettings.FixedScale.TargetDedicatedNodes); - Assert.AreEqual(1, pool.DeploymentConfiguration.VirtualMachineConfiguration.ContainerConfiguration.ContainerRegistries.Count); - }); - } - [TestMethod] public async Task NewTesTaskGetsScheduledSuccessfully() { var tesTask = GetTesTask(); - _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig(true)(), GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); + _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); GuardAssertsWithTesTask(tesTask, () => Assert.AreEqual(TesState.INITIALIZINGEnum, tesTask.State)); } @@ -727,13 +643,13 @@ public async Task PreemptibleTesTaskGetsScheduledToLowPriorityVm() var tesTask = GetTesTask(); tesTask.Resources.Preemptible = true; 
- (_, _, var poolInformation, _) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig(true)(), GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); + (_, _, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); GuardAssertsWithTesTask(tesTask, () => { - Assert.AreEqual("VmSizeLowPri1", poolInformation.AutoPoolSpecification.PoolSpecification.VirtualMachineSize); - Assert.AreEqual(1, poolInformation.AutoPoolSpecification.PoolSpecification.TargetLowPriorityComputeNodes); - Assert.AreEqual(0, poolInformation.AutoPoolSpecification.PoolSpecification.TargetDedicatedComputeNodes); + Assert.AreEqual("VmSizeLowPri1", pool.VmSize); + Assert.IsTrue(pool.ScaleSettings.AutoScale.Formula.Contains("\n$TargetLowPriorityNodes = ")); + Assert.IsFalse(pool.ScaleSettings.AutoScale.Formula.Contains("\n$TargetDedicated = ")); }); } @@ -743,13 +659,13 @@ public async Task NonPreemptibleTesTaskGetsScheduledToDedicatedVm() var tesTask = GetTesTask(); tesTask.Resources.Preemptible = false; - (_, _, var poolInformation, _) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig(true)(), GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); + (_, _, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); GuardAssertsWithTesTask(tesTask, () => { - Assert.AreEqual("VmSizeDedicated1", poolInformation.AutoPoolSpecification.PoolSpecification.VirtualMachineSize); - Assert.AreEqual(1, poolInformation.AutoPoolSpecification.PoolSpecification.TargetDedicatedComputeNodes); - Assert.AreEqual(0, poolInformation.AutoPoolSpecification.PoolSpecification.TargetLowPriorityComputeNodes); + Assert.AreEqual("VmSizeDedicated1", pool.VmSize); + 
Assert.IsTrue(pool.ScaleSettings.AutoScale.Formula.Contains("\n$TargetDedicated = ")); + Assert.IsFalse(pool.ScaleSettings.AutoScale.Formula.Contains("\n$TargetLowPriorityNodes = ")); }); } @@ -759,13 +675,13 @@ public async Task PreemptibleTesTaskGetsScheduledToLowPriorityVm_PerVMFamilyEnfo var tesTask = GetTesTask(); tesTask.Resources.Preemptible = true; - (_, _, var poolInformation, _) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig(true)(), GetMockAzureProxy(AzureProxyReturnValues.DefaultsPerVMFamilyEnforced), AzureProxyReturnValues.DefaultsPerVMFamilyEnforced); + (_, _, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(AzureProxyReturnValues.DefaultsPerVMFamilyEnforced), AzureProxyReturnValues.DefaultsPerVMFamilyEnforced); GuardAssertsWithTesTask(tesTask, () => { - Assert.AreEqual("VmSizeLowPri1", poolInformation.AutoPoolSpecification.PoolSpecification.VirtualMachineSize); - Assert.AreEqual(1, poolInformation.AutoPoolSpecification.PoolSpecification.TargetLowPriorityComputeNodes); - Assert.AreEqual(0, poolInformation.AutoPoolSpecification.PoolSpecification.TargetDedicatedComputeNodes); + Assert.AreEqual("VmSizeLowPri1", pool.VmSize); + Assert.IsTrue(pool.ScaleSettings.AutoScale.Formula.Contains("\n$TargetLowPriorityNodes = ")); + Assert.IsFalse(pool.ScaleSettings.AutoScale.Formula.Contains("\n$TargetDedicated = ")); }); } @@ -775,13 +691,13 @@ public async Task NonPreemptibleTesTaskGetsScheduledToDedicatedVm_PerVMFamilyEnf var tesTask = GetTesTask(); tesTask.Resources.Preemptible = false; - (_, _, var poolInformation, _) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig(true)(), GetMockAzureProxy(AzureProxyReturnValues.DefaultsPerVMFamilyEnforced), AzureProxyReturnValues.DefaultsPerVMFamilyEnforced); + (_, _, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), 
GetMockAzureProxy(AzureProxyReturnValues.DefaultsPerVMFamilyEnforced), AzureProxyReturnValues.DefaultsPerVMFamilyEnforced); GuardAssertsWithTesTask(tesTask, () => { - Assert.AreEqual("VmSizeDedicated1", poolInformation.AutoPoolSpecification.PoolSpecification.VirtualMachineSize); - Assert.AreEqual(1, poolInformation.AutoPoolSpecification.PoolSpecification.TargetDedicatedComputeNodes); - Assert.AreEqual(0, poolInformation.AutoPoolSpecification.PoolSpecification.TargetLowPriorityComputeNodes); + Assert.AreEqual("VmSizeDedicated1", pool.VmSize); + Assert.IsTrue(pool.ScaleSettings.AutoScale.Formula.Contains("\n$TargetDedicated = ")); + Assert.IsFalse(pool.ScaleSettings.AutoScale.Formula.Contains("\n$TargetLowPriorityNodes = ")); }); } @@ -795,12 +711,12 @@ public async Task NonPreemptibleTesTaskGetsWarningAndIsScheduledToLowPriorityVmI var azureProxyReturnValues = AzureProxyReturnValues.DefaultsPerVMFamilyEnforced; azureProxyReturnValues.VmSizesAndPrices.First(vm => vm.VmSize.Equals("VmSize3", StringComparison.OrdinalIgnoreCase)).PricePerHour = 44; - (_, _, var poolInformation, _) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig(true)(), GetMockAzureProxy(azureProxyReturnValues), azureProxyReturnValues); + (_, _, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(azureProxyReturnValues), azureProxyReturnValues); GuardAssertsWithTesTask(tesTask, () => { Assert.IsTrue(tesTask.Logs.Any(l => "UsedLowPriorityInsteadOfDedicatedVm".Equals(l.Warning))); - Assert.AreEqual(1, poolInformation.AutoPoolSpecification.PoolSpecification.TargetLowPriorityComputeNodes); + Assert.IsTrue(pool.ScaleSettings.AutoScale.Formula.Contains("\n$TargetLowPriorityNodes = ")); }); } @@ -810,12 +726,12 @@ public async Task TesTaskGetsScheduledToLowPriorityVmIfSettingUsePreemptibleVmsO var tesTask = GetTesTask(); tesTask.Resources.Preemptible = false; - var config = GetMockConfig(true)() + var config = GetMockConfig()() 
.Append(("BatchScheduling:UsePreemptibleVmsOnly", "true")); - (_, _, var poolInformation, _) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, config, GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); + (_, _, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, config, GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); - GuardAssertsWithTesTask(tesTask, () => Assert.AreEqual(1, poolInformation.AutoPoolSpecification.PoolSpecification.TargetLowPriorityComputeNodes)); + GuardAssertsWithTesTask(tesTask, () => Assert.IsTrue(pool.ScaleSettings.AutoScale.Formula.Contains("\n$TargetLowPriorityNodes = "))); } [TestMethod] @@ -826,10 +742,10 @@ static async Task RunTest(string allowedVmSizes, TesState expectedTaskState, str var tesTask = GetTesTask(); tesTask.Resources.Preemptible = true; - var config = GetMockConfig(true)() + var config = GetMockConfig()() .Append(("AllowedVmSizes", allowedVmSizes)); - (_, _, var poolInformation, _) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, config, GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); + (_, _, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, config, GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); GuardAssertsWithTesTask(tesTask, () => { @@ -837,7 +753,7 @@ static async Task RunTest(string allowedVmSizes, TesState expectedTaskState, str if (expectedSelectedVmSize is not null) { - Assert.AreEqual(expectedSelectedVmSize, poolInformation.AutoPoolSpecification.PoolSpecification.VirtualMachineSize); + Assert.AreEqual(expectedSelectedVmSize, pool.VmSize); } }); } @@ -865,7 +781,7 @@ public async Task TaskStateTransitionsFromRunningState() Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.MoreThanOneJobFound)); Assert.AreEqual(TesState.EXECUTORERROREnum, 
await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.NodeDiskFull)); Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.ActiveJobWithMissingAutoPool)); - Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.NodePreempted)); + Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.NodePreempted)); } [TestMethod] @@ -883,7 +799,7 @@ public async Task TaskStateTransitionsFromInitializingState() Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.NodeAllocationFailed)); Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.ImageDownloadFailed)); Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.ActiveJobWithMissingAutoPool)); - Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.NodePreempted)); + Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.NodePreempted)); } [TestMethod] @@ -896,7 +812,7 @@ public async Task TaskStateTransitionsFromQueuedState() Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchJobAndTaskStates.TaskFailed)); Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchJobAndTaskStates.MoreThanOneJobFound)); Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchJobAndTaskStates.NodeDiskFull)); - Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchJobAndTaskStates.TaskNotFound)); + Assert.AreEqual(TesState.INITIALIZINGEnum, await 
GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchJobAndTaskStates.TaskNotFound)); } [TestMethod] @@ -963,7 +879,7 @@ public async Task TaskGetsCancelled() azureProxy = mock; }); - _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig(false)(), azureProxySetter, azureProxyReturnValues); + _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxySetter, azureProxyReturnValues); GuardAssertsWithTesTask(tesTask, () => { @@ -999,7 +915,7 @@ public async Task SuccessfullyCompletedTaskContainsBatchNodeMetrics() azureProxyReturnValues.BatchJobAndTaskState = BatchJobAndTaskStates.TaskCompletedSuccessfully; azureProxyReturnValues.DownloadedBlobContent = metricsFileContent; - _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig(false)(), GetMockAzureProxy(azureProxyReturnValues), azureProxyReturnValues); + _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(azureProxyReturnValues), azureProxyReturnValues); GuardAssertsWithTesTask(tesTask, () => { @@ -1036,7 +952,7 @@ public async Task SuccessfullyCompletedTaskContainsCromwellResultCode() azureProxyReturnValues.DownloadedBlobContent = "2"; var azureProxy = GetMockAzureProxy(azureProxyReturnValues); - _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig(false)(), azureProxy, azureProxyReturnValues); + _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxy, azureProxyReturnValues); GuardAssertsWithTesTask(tesTask, () => { @@ -1158,7 +1074,7 @@ public async Task QueryStringsAreRemovedFromLocalFilePathsWhenCommandScriptIsPro azureProxy = mock; }); - _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig(false)(), azureProxySetter, azureProxyReturnValues); + _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxySetter, azureProxyReturnValues); var modifiedCommandScript = 
(string)azureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IAzureProxy.UploadBlobAsync) && Guid.TryParseExact(Path.GetFileName(new Uri(i.Arguments[0].ToString()).AbsolutePath), "D", out _))?.Arguments[1]; var filesToDownload = GetFilesToDownload(azureProxy); @@ -1192,7 +1108,7 @@ public async Task QueryStringsAreRemovedFromLocalFilePathsWhenCommandScriptIsPro azureProxy = mock; }); - _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig(false)(), azureProxySetter, AzureProxyReturnValues.Defaults); + _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxySetter, AzureProxyReturnValues.Defaults); var modifiedCommandScript = (string)azureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IAzureProxy.UploadBlobAsync) && Guid.TryParseExact(Path.GetFileName(new Uri(i.Arguments[0].ToString()).AbsolutePath), "D", out _))?.Arguments[1]; var filesToDownload = GetFilesToDownload(azureProxy); @@ -1207,10 +1123,11 @@ public async Task QueryStringsAreRemovedFromLocalFilePathsWhenCommandScriptIsPro }); } + // TODO: Is this test still valid? [TestMethod] public async Task PublicHttpUrlsAreKeptIntact() { - var config = GetMockConfig(true)() + var config = GetMockConfig()() .Append(("Storage:ExternalStorageContainers", "https://externalaccount1.blob.core.windows.net/container1?sas1; https://externalaccount2.blob.core.windows.net/container2/?sas2; https://externalaccount2.blob.core.windows.net?accountsas;")); var tesTask = GetTesTask(); @@ -1243,10 +1160,11 @@ public async Task PublicHttpUrlsAreKeptIntact() }); } + // TODO: Is this test still valid? 
[TestMethod] public async Task PrivatePathsAndUrlsGetSasToken() { - var config = GetMockConfig(true)() + var config = GetMockConfig()() .Append(("Storage:ExternalStorageContainers", "https://externalaccount1.blob.core.windows.net/container1?sas1; https://externalaccount2.blob.core.windows.net/container2/?sas2; https://externalaccount2.blob.core.windows.net?accountsas;")); var tesTask = GetTesTask(); @@ -1325,13 +1243,13 @@ public async Task PrivateImagesArePulledUsingPoolConfiguration() GetMockAzureProxy(AzureProxyReturnValues.Defaults)(mock); azureProxy = mock; }); - (_, var cloudTask, var poolInformation, _) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig(true)(), azureProxySetter, AzureProxyReturnValues.Defaults); + (_, var cloudTask, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxySetter, AzureProxyReturnValues.Defaults); var batchScript = (string)azureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IAzureProxy.UploadBlobAsync) && i.Arguments[0].ToString().Contains("/batch_script"))?.Arguments[1]; GuardAssertsWithTesTask(tesTask, () => { - Assert.IsNotNull(poolInformation.AutoPoolSpecification.PoolSpecification.VirtualMachineConfiguration.ContainerConfiguration); - Assert.AreEqual("registryServer1.io", poolInformation.AutoPoolSpecification.PoolSpecification.VirtualMachineConfiguration.ContainerConfiguration.ContainerRegistries.FirstOrDefault()?.RegistryServer); + Assert.IsNotNull(pool.DeploymentConfiguration.VirtualMachineConfiguration.ContainerConfiguration); + Assert.AreEqual("registryServer1.io", pool.DeploymentConfiguration.VirtualMachineConfiguration.ContainerConfiguration.ContainerRegistries.FirstOrDefault()?.RegistryServer); Assert.AreEqual(2, Regex.Matches(batchScript, tesTask.Executors.First().Image, RegexOptions.IgnoreCase).Count); Assert.IsFalse(batchScript.Contains($"docker pull --quiet {tesTask.Executors.First().Image}")); }); @@ -1349,12 +1267,12 @@ 
public async Task PublicImagesArePulledInTaskCommand() GetMockAzureProxy(AzureProxyReturnValues.Defaults)(mock); azureProxy = mock; }); - (_, var cloudTask, var poolInformation, _) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig(true)(), azureProxySetter, AzureProxyReturnValues.Defaults); + (_, var cloudTask, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxySetter, AzureProxyReturnValues.Defaults); var batchScript = (string)azureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IAzureProxy.UploadBlobAsync) && i.Arguments[0].ToString().Contains("/batch_script"))?.Arguments[1]; GuardAssertsWithTesTask(tesTask, () => { - Assert.IsNull(poolInformation.AutoPoolSpecification.PoolSpecification.VirtualMachineConfiguration.ContainerConfiguration); + Assert.IsNull(pool.DeploymentConfiguration.VirtualMachineConfiguration.ContainerConfiguration); Assert.AreEqual(3, Regex.Matches(batchScript, tesTask.Executors.First().Image, RegexOptions.IgnoreCase).Count); Assert.IsTrue(batchScript.Contains("docker pull --quiet ubuntu")); }); @@ -1365,7 +1283,7 @@ public async Task PrivateContainersRunInsideDockerInDockerContainer() { var tesTask = GetTesTask(); - (_, var cloudTask, _, _) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig(false)(), GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); + (_, var cloudTask, _, _) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); GuardAssertsWithTesTask(tesTask, () => { @@ -1380,7 +1298,7 @@ public async Task PublicContainersRunInsideRegularTaskCommand() var tesTask = GetTesTask(); tesTask.Executors.First().Image = "ubuntu"; - (_, var cloudTask, _, _) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig(false)(), GetMockAzureProxy(AzureProxyReturnValues.Defaults), 
AzureProxyReturnValues.Defaults); + (_, var cloudTask, _, _) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); GuardAssertsWithTesTask(tesTask, () => Assert.IsNull(cloudTask.ContainerSettings)); } @@ -1414,7 +1332,7 @@ public async Task CromwellWriteFilesAreDiscoveredAndAddedIfMissedWithContentScri Uri executionDirectoryUri = default; - _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig(false)(), azureProxySetter, azureProxyReturnValues, serviceProviderActions: serviceProvider => + _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxySetter, azureProxyReturnValues, serviceProviderActions: serviceProvider => { var storageAccessProvider = serviceProvider.GetServiceOrCreateInstance(); @@ -1492,7 +1410,7 @@ public async Task LocalFilesInCromwellTmpDirectoryAreDiscoveredAndUploaded() GetMockAzureProxy(azureProxyReturnValues)(mock); azureProxy = mock; }); - _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig(false)(), azureProxySetter, azureProxyReturnValues); + _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxySetter, azureProxyReturnValues); var filesToDownload = GetFilesToDownload(azureProxy); @@ -1509,19 +1427,19 @@ public async Task LocalFilesInCromwellTmpDirectoryAreDiscoveredAndUploaded() [TestMethod] public async Task PoolIsCreatedInSubnetWhenBatchNodesSubnetIdIsSet() { - var config = GetMockConfig(true)() + var config = GetMockConfig()() .Append(("BatchNodes:SubnetId", "subnet1")); var tesTask = GetTesTask(); var azureProxy = GetMockAzureProxy(AzureProxyReturnValues.Defaults); - (_, _, var poolInformation, _) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, config, azureProxy, AzureProxyReturnValues.Defaults); + (_, _, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, config, azureProxy, 
AzureProxyReturnValues.Defaults); - var poolNetworkConfiguration = poolInformation.AutoPoolSpecification.PoolSpecification.NetworkConfiguration; + var poolNetworkConfiguration = pool.NetworkConfiguration; GuardAssertsWithTesTask(tesTask, () => { - Assert.AreEqual(Microsoft.Azure.Batch.Common.IPAddressProvisioningType.BatchManaged, poolNetworkConfiguration?.PublicIPAddressConfiguration?.Provision); + Assert.AreEqual(Microsoft.Azure.Management.Batch.Models.IPAddressProvisioningType.BatchManaged, poolNetworkConfiguration?.PublicIPAddressConfiguration?.Provision); Assert.AreEqual("subnet1", poolNetworkConfiguration?.SubnetId); }); } @@ -1529,20 +1447,20 @@ public async Task PoolIsCreatedInSubnetWhenBatchNodesSubnetIdIsSet() [TestMethod] public async Task PoolIsCreatedWithoutPublicIpWhenSubnetAndDisableBatchNodesPublicIpAddressAreSet() { - var config = GetMockConfig(true)() + var config = GetMockConfig()() .Append(("BatchNodes:SubnetId", "subnet1")) .Append(("BatchNodes:DisablePublicIpAddress", "true")); var tesTask = GetTesTask(); var azureProxy = GetMockAzureProxy(AzureProxyReturnValues.Defaults); - (_, _, var poolInformation, _) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, config, azureProxy, AzureProxyReturnValues.Defaults); + (_, _, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, config, azureProxy, AzureProxyReturnValues.Defaults); - var poolNetworkConfiguration = poolInformation.AutoPoolSpecification.PoolSpecification.NetworkConfiguration; + var poolNetworkConfiguration = pool.NetworkConfiguration; GuardAssertsWithTesTask(tesTask, () => { - Assert.AreEqual(Microsoft.Azure.Batch.Common.IPAddressProvisioningType.NoPublicIPAddresses, poolNetworkConfiguration?.PublicIPAddressConfiguration?.Provision); + Assert.AreEqual(Microsoft.Azure.Management.Batch.Models.IPAddressProvisioningType.NoPublicIPAddresses, poolNetworkConfiguration?.PublicIPAddressConfiguration?.Provision); Assert.AreEqual("subnet1", 
poolNetworkConfiguration?.SubnetId); }); } @@ -1570,13 +1488,13 @@ public async Task var azureProxyReturnValues = AzureProxyReturnValues.Defaults; azureProxyReturnValues.BatchJobAndTaskState = azureBatchJobAndTaskState ?? azureProxyReturnValues.BatchJobAndTaskState; - _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig(true)(), GetMockAzureProxy(azureProxyReturnValues), azureProxyReturnValues); + _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(azureProxyReturnValues), azureProxyReturnValues); return (tesTask.Logs?.LastOrDefault()?.FailureReason, tesTask.Logs?.LastOrDefault()?.SystemLogs?.ToArray()); } - private static Task<(string JobId, CloudTask CloudTask, PoolInformation PoolInformation, Pool batchModelsPool)> ProcessTesTaskAndGetBatchJobArgumentsAsync(bool autopool) - => ProcessTesTaskAndGetBatchJobArgumentsAsync(GetTesTask(), GetMockConfig(autopool)(), GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); + private static Task<(string JobId, CloudTask CloudTask, PoolInformation PoolInformation, Pool batchModelsPool)> ProcessTesTaskAndGetBatchJobArgumentsAsync() + => ProcessTesTaskAndGetBatchJobArgumentsAsync(GetTesTask(), GetMockConfig()(), GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); private static async Task<(string JobId, CloudTask CloudTask, PoolInformation PoolInformation, Pool batchModelsPool)> ProcessTesTaskAndGetBatchJobArgumentsAsync(TesTask tesTask, IEnumerable<(string Key, string Value)> configuration, Action> azureProxy, AzureProxyReturnValues azureProxyReturnValues, Action additionalActions = default, Action> serviceProviderActions = default) { @@ -1594,12 +1512,11 @@ public async Task await batchScheduler.ProcessTesTaskAsync(tesTask, System.Threading.CancellationToken.None); var createBatchPoolAsyncInvocation = serviceProvider.AzureProxy.Invocations.FirstOrDefault(i => i.Method.Name == 
nameof(IAzureProxy.CreateBatchPoolAsync)); - var createAutoPoolBatchJobAsyncInvocation = serviceProvider.AzureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IAzureProxy.CreateAutoPoolModeBatchJobAsync)); var addBatchTaskAsyncInvocation = serviceProvider.AzureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IAzureProxy.AddBatchTaskAsync)); - var jobId = (addBatchTaskAsyncInvocation?.Arguments[0] ?? createAutoPoolBatchJobAsyncInvocation?.Arguments[0]) as string; - var cloudTask = (addBatchTaskAsyncInvocation?.Arguments[1] ?? createAutoPoolBatchJobAsyncInvocation?.Arguments[1]) as CloudTask; - var poolInformation = (addBatchTaskAsyncInvocation?.Arguments[2] ?? createAutoPoolBatchJobAsyncInvocation?.Arguments[2]) as PoolInformation; + var jobId = (addBatchTaskAsyncInvocation?.Arguments[0]) as string; + var cloudTask = (addBatchTaskAsyncInvocation?.Arguments[1]) as CloudTask; + var poolInformation = (addBatchTaskAsyncInvocation?.Arguments[2]) as PoolInformation; var batchPoolsModel = createBatchPoolAsyncInvocation?.Arguments[0] as Pool; return (jobId, cloudTask, poolInformation, batchPoolsModel); @@ -1664,15 +1581,17 @@ private static Action> GetMockQuotaProvider(AzureProxy private static TestServices.TestServiceProvider GetServiceProvider(IEnumerable<(string Key, string Value)> configuration, Action> azureProxy, Action> quotaProvider, Action> skuInfoProvider, Action> containerRegistryProviderSetup, Action> allowedVmSizesServiceSetup, Action additionalActions = default) => new(wrapAzureProxy: true, configuration: configuration, azureProxy: azureProxy, batchQuotaProvider: quotaProvider, batchSkuInformationProvider: skuInfoProvider, accountResourceInformation: GetNewBatchResourceInfo(), containerRegistryProviderSetup: containerRegistryProviderSetup, allowedVmSizesServiceSetup: allowedVmSizesServiceSetup, additionalActions: additionalActions); + // TODO: Are these tests still valid? 
private static async Task GetNewTesTaskStateAsync(TesTask tesTask, AzureProxyReturnValues azureProxyReturnValues) { - _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig(true)(), GetMockAzureProxy(azureProxyReturnValues), azureProxyReturnValues); + //_ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(azureProxyReturnValues), azureProxyReturnValues); + var test = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(azureProxyReturnValues), azureProxyReturnValues); return tesTask.State; } private static Task GetNewTesTaskStateAsync(TesState currentTesTaskState, AzureBatchJobAndTaskState azureBatchJobAndTaskState) - => GetNewTesTaskStateAsync(new TesTask { Id = "test", State = currentTesTaskState }, azureBatchJobAndTaskState); + => GetNewTesTaskStateAsync(new TesTask { Id = "test", State = currentTesTaskState, Executors = new() { new() { Image = "imageName1", Command = new() { "command" } } } }, azureBatchJobAndTaskState); private static Task GetNewTesTaskStateAsync(TesTask tesTask, AzureBatchJobAndTaskState? 
azureBatchJobAndTaskState = null) { @@ -1712,10 +1631,7 @@ private static Action> GetMockAzureProxy(AzureProxyReturnValue azureProxy.Setup(a => a.GetActivePoolsAsync(It.IsAny())) .Returns(AsyncEnumerable.Empty()); - azureProxy.Setup(a => a.GetNextBatchJobIdAsync(It.IsAny(), It.IsAny())) - .Returns(Task.FromResult(azureProxyReturnValues.NextBatchJobId)); - - azureProxy.Setup(a => a.GetBatchJobAndTaskStateAsync(It.IsAny(), It.IsAny(), It.IsAny())) + azureProxy.Setup(a => a.GetBatchJobAndTaskStateAsync(It.IsAny(), It.IsAny())) .Returns(Task.FromResult(azureProxyReturnValues.BatchJobAndTaskState)); azureProxy.Setup(a => a.GetStorageAccountInfoAsync("defaultstorageaccount", It.IsAny())) @@ -1768,7 +1684,7 @@ private static Action> GetMockAzureProxy(AzureProxyReturnValue .Returns(azureProxyReturnValues.AzureProxyListTasks); }; - private static Func> GetMockConfig(bool autopool) + private static Func> GetMockConfig() => new(() => { var config = Enumerable.Empty<(string Key, string Value)>() @@ -1784,10 +1700,6 @@ private static Action> GetMockAzureProxy(AzureProxyReturnValue .Append(("BatchImageGen2:Sku", "2004")) .Append(("BatchImageGen2:Version", "latest")) .Append(("BatchImageGen2:NodeAgentSkuId", "batch.node.ubuntu 20.04")); - if (autopool) - { - config = config.Append(("BatchScheduling:UseLegacyAutopools", "true")); - } return config; }); @@ -1810,7 +1722,7 @@ private static IEnumerable GetFilesToDownload(Mock private static TestServices.TestServiceProvider GetServiceProvider(AzureProxyReturnValues azureProxyReturn = default) { azureProxyReturn ??= AzureProxyReturnValues.Defaults; - var config = GetMockConfig(false)(); + var config = GetMockConfig()(); return new( wrapAzureProxy: true, accountResourceInformation: new("defaultbatchaccount", "defaultresourcegroup", "defaultsubscription", "defaultregion"), @@ -1823,7 +1735,7 @@ private static TestServices.TestServiceProvider GetServiceProvi private static TestServices.TestServiceProvider 
GetServiceProviderWithMockStorageProvider(AzureProxyReturnValues azureProxyReturn = default) { azureProxyReturn ??= AzureProxyReturnValues.Defaults; - var config = GetMockConfig(false)(); + var config = GetMockConfig()(); return new( wrapAzureProxy: true, mockStorageAccessProvider: true, @@ -1909,7 +1821,6 @@ private class AzureProxyReturnValues public int ActiveJobCount { get; set; } public int ActivePoolCount { get; set; } public AzureBatchJobAndTaskState BatchJobAndTaskState { get; set; } - public string NextBatchJobId { get; set; } public string StorageAccountKey { get; set; } public string DownloadedBlobContent { get; set; } public bool LocalFileExists { get; set; } @@ -1935,7 +1846,6 @@ private class AzureProxyReturnValues ActiveJobCount = 0, ActivePoolCount = 0, BatchJobAndTaskState = BatchJobAndTaskStates.JobNotFound, - NextBatchJobId = "JobId-1", StorageAccountKey = "Key1", DownloadedBlobContent = string.Empty, LocalFileExists = true @@ -1999,7 +1909,7 @@ private class TestBatchQuotaVerifierQuotaMaxedOut : TestBatchQuotaVerifierBase { public TestBatchQuotaVerifierQuotaMaxedOut(IBatchQuotaProvider batchQuotaProvider) : base(batchQuotaProvider) { } - public override Task CheckBatchAccountQuotasAsync(VirtualMachineInformation _1, bool _2, bool _3, System.Threading.CancellationToken cancellationToken) + public override Task CheckBatchAccountQuotasAsync(VirtualMachineInformation _1, bool _2, System.Threading.CancellationToken cancellationToken) => throw new AzureBatchQuotaMaxedOutException("Test AzureBatchQuotaMaxedOutException"); } @@ -2007,7 +1917,7 @@ private class TestBatchQuotaVerifierLowQuota : TestBatchQuotaVerifierBase { public TestBatchQuotaVerifierLowQuota(IBatchQuotaProvider batchQuotaProvider) : base(batchQuotaProvider) { } - public override Task CheckBatchAccountQuotasAsync(VirtualMachineInformation _1, bool _2, bool _3, System.Threading.CancellationToken cancellationToken) + public override Task 
CheckBatchAccountQuotasAsync(VirtualMachineInformation _1, bool _2, System.Threading.CancellationToken cancellationToken) => throw new AzureBatchLowQuotaException("Test AzureBatchLowQuotaException"); } @@ -2018,7 +1928,7 @@ private abstract class TestBatchQuotaVerifierBase : IBatchQuotaVerifier protected TestBatchQuotaVerifierBase(IBatchQuotaProvider batchQuotaProvider) => this.batchQuotaProvider = batchQuotaProvider; - public abstract Task CheckBatchAccountQuotasAsync(VirtualMachineInformation virtualMachineInformation, bool needPoolOrJobQuotaCheck, bool needCoresUtilizationQuotaCheck, System.Threading.CancellationToken cancellationToken); + public abstract Task CheckBatchAccountQuotasAsync(VirtualMachineInformation virtualMachineInformation, bool needPoolOrJobQuotaCheck, System.Threading.CancellationToken cancellationToken); public IBatchQuotaProvider GetBatchQuotaProvider() => batchQuotaProvider; diff --git a/src/TesApi.Tests/DeleteCompletedBatchJobsHostedServiceTests.cs b/src/TesApi.Tests/DeleteCompletedBatchJobsHostedServiceTests.cs deleted file mode 100644 index 3600cc66b..000000000 --- a/src/TesApi.Tests/DeleteCompletedBatchJobsHostedServiceTests.cs +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. 
- -using System; -using System.Linq; -using System.Threading.Tasks; -using Microsoft.VisualStudio.TestTools.UnitTesting; -using Moq; -using Tes.Models; -using Tes.Repository; -using TesApi.Web; - -namespace TesApi.Tests -{ - [TestClass] - public class DeleteCompletedBatchJobsHostedServiceTests - { - private static readonly TimeSpan oldestJobAge = TimeSpan.FromDays(7); - - [TestMethod] - public async Task DeleteCompletedBatchJobs_DeletesJobs_TesStateCompleted() - { - // Arrange & Act - var firstTesTask = new TesTask { Id = "tesTaskId1", State = TesState.COMPLETEEnum }; - var secondTesTask = new TesTask { Id = "tesTaskId2", State = TesState.RUNNINGEnum }; - var thirdTesTask = new TesTask { Id = "tesTaskId3", State = TesState.INITIALIZINGEnum }; - var azureProxy = await ArrangeTest(new[] { firstTesTask, secondTesTask, thirdTesTask }); - - // Assert - azureProxy.Verify(i => i.ListOldJobsToDeleteAsync(oldestJobAge, It.IsAny())); - azureProxy.Verify(i => i.DeleteBatchJobAsync("tesTaskId1", It.IsAny())); - azureProxy.Verify(i => i.DeleteBatchPoolIfExistsAsync("tesTaskId1", It.IsAny())); - azureProxy.VerifyNoOtherCalls(); - } - - [TestMethod] - public async Task DeleteCompletedBatchJobs_DeletesJobs_TesStateError() - { - // Arrange & Act - var firstTesTask = new TesTask { Id = "tesTaskId1", State = TesState.SYSTEMERROREnum }; - var secondTesTask = new TesTask { Id = "tesTaskId2", State = TesState.EXECUTORERROREnum }; - var thirdTesTask = new TesTask { Id = "tesTaskId3", State = TesState.PAUSEDEnum }; - var azureProxy = await ArrangeTest(new[] { firstTesTask, secondTesTask, thirdTesTask }); - - // Assert - azureProxy.Verify(i => i.ListOldJobsToDeleteAsync(oldestJobAge, It.IsAny())); - azureProxy.Verify(i => i.DeleteBatchJobAsync("tesTaskId1", It.IsAny())); - azureProxy.Verify(i => i.DeleteBatchJobAsync("tesTaskId2", It.IsAny())); - azureProxy.Verify(i => i.DeleteBatchPoolIfExistsAsync("tesTaskId1", It.IsAny())); - azureProxy.Verify(i => 
i.DeleteBatchPoolIfExistsAsync("tesTaskId2", It.IsAny())); - azureProxy.VerifyNoOtherCalls(); - } - - [TestMethod] - public async Task DeleteCompletedBatchJobs_DeletesJobs_TesStateCanceled() - { - // Arrange & Act - var firstTesTask = new TesTask { Id = "tesTaskId1", State = TesState.CANCELEDEnum }; - var secondTesTask = new TesTask { Id = "tesTaskId2", State = TesState.QUEUEDEnum }; - var thirdTesTask = new TesTask { Id = "tesTaskId3", State = TesState.INITIALIZINGEnum }; - var azureProxy = await ArrangeTest(new[] { firstTesTask, secondTesTask, thirdTesTask }); - - // Assert - azureProxy.Verify(i => i.ListOldJobsToDeleteAsync(oldestJobAge, It.IsAny())); - azureProxy.Verify(i => i.DeleteBatchJobAsync("tesTaskId1", It.IsAny())); - azureProxy.Verify(i => i.DeleteBatchPoolIfExistsAsync("tesTaskId1", It.IsAny())); - azureProxy.VerifyNoOtherCalls(); - } - - [TestMethod] - public async Task DeleteCompletedBatchJobs_DeletesJobs_TesStateUnknown() - { - // Arrange & Act - var firstTesTask = new TesTask { Id = "tesTaskId1", State = TesState.UNKNOWNEnum }; - var secondTesTask = new TesTask { Id = "tesTaskId2", State = TesState.PAUSEDEnum }; - var thirdTesTask = new TesTask { Id = "tesTaskId3", State = TesState.RUNNINGEnum }; - var azureProxy = await ArrangeTest(new[] { firstTesTask, secondTesTask, thirdTesTask }); - - // Assert - azureProxy.Verify(i => i.ListOldJobsToDeleteAsync(oldestJobAge, It.IsAny())); - azureProxy.Verify(i => i.DeleteBatchJobAsync("tesTaskId1", It.IsAny())); - azureProxy.Verify(i => i.DeleteBatchPoolIfExistsAsync("tesTaskId1", It.IsAny())); - azureProxy.VerifyNoOtherCalls(); - } - - private async Task> ArrangeTest(TesTask[] tasks) - { - void SetupRepository(Mock> mockRepo) - { - foreach (var item in tasks) - { - mockRepo.Setup(repo => repo.TryGetItemAsync(item.Id, It.IsAny(), It.IsAny>())) - .Callback>((id, _1, action) => - { - action(item); - }) - .ReturnsAsync(true); - } - } - - using var serviceProvider = new TestServices.TestServiceProvider( - 
configuration: Enumerable.Repeat(("BatchScheduling:UseLegacyAutopools", true.ToString()), 1), - azureProxy: a => a.Setup(p => p.ListOldJobsToDeleteAsync(oldestJobAge, It.IsAny())) - .ReturnsAsync(tasks.Select(i => i.Id + "-1")), - tesTaskRepository: SetupRepository); - - await serviceProvider.GetT().StartAsync(new System.Threading.CancellationToken()); - return serviceProvider.AzureProxy; - } - } -} diff --git a/src/TesApi.Tests/DeleteOrphanedBatchJobsHostedServiceTests.cs b/src/TesApi.Tests/DeleteOrphanedBatchJobsHostedServiceTests.cs deleted file mode 100644 index 188a1e14e..000000000 --- a/src/TesApi.Tests/DeleteOrphanedBatchJobsHostedServiceTests.cs +++ /dev/null @@ -1,113 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. - -using System; -using System.Linq; -using System.Threading.Tasks; -using Microsoft.VisualStudio.TestTools.UnitTesting; -using Moq; -using Tes.Models; -using Tes.Repository; -using TesApi.Web; - -namespace TesApi.Tests -{ - [TestClass] - public class DeleteOrphanedBatchJobsHostedServiceTests - { - private static readonly TimeSpan minJobAge = TimeSpan.FromHours(1); - - [TestMethod] - public async Task DeleteOrphanedBatchJobs_DeletesJobs_TesStateCompleted() - { - // Arrange & Act - var firstTesTask = new TesTask { Id = "tesTaskId1", State = TesState.COMPLETEEnum }; - var secondTesTask = new TesTask { Id = "tesTaskId2", State = TesState.RUNNINGEnum }; - var thirdTesTask = new TesTask { Id = "tesTaskId3", State = TesState.INITIALIZINGEnum }; - var azureProxy = await ArrangeTest(new[] { firstTesTask, secondTesTask, thirdTesTask }); - - // Assert - azureProxy.Verify(i => i.ListOrphanedJobsToDeleteAsync(minJobAge, It.IsAny())); - azureProxy.Verify(i => i.DeleteBatchJobAsync("tesTaskId1", It.IsAny())); - azureProxy.Verify(i => i.DeleteBatchPoolIfExistsAsync("tesTaskId1", It.IsAny())); - azureProxy.VerifyNoOtherCalls(); - } - - [TestMethod] - public async Task DeleteOrphanedBatchJobs_DeletesJobs_TesStateError() - { - 
// Arrange & Act - var firstTesTask = new TesTask { Id = "tesTaskId1", State = TesState.SYSTEMERROREnum }; - var secondTesTask = new TesTask { Id = "tesTaskId2", State = TesState.EXECUTORERROREnum }; - var thirdTesTask = new TesTask { Id = "tesTaskId3", State = TesState.PAUSEDEnum }; - var azureProxy = await ArrangeTest(new[] { firstTesTask, secondTesTask, thirdTesTask }); - - // Assert - azureProxy.Verify(i => i.ListOrphanedJobsToDeleteAsync(minJobAge, It.IsAny())); - azureProxy.Verify(i => i.DeleteBatchJobAsync("tesTaskId1", It.IsAny())); - azureProxy.Verify(i => i.DeleteBatchJobAsync("tesTaskId2", It.IsAny())); - azureProxy.Verify(i => i.DeleteBatchPoolIfExistsAsync("tesTaskId1", It.IsAny())); - azureProxy.Verify(i => i.DeleteBatchPoolIfExistsAsync("tesTaskId2", It.IsAny())); - azureProxy.VerifyNoOtherCalls(); - } - - [TestMethod] - public async Task DeleteOrphanedBatchJobs_DeletesJobs_TesStateCanceled() - { - // Arrange & Act - var firstTesTask = new TesTask { Id = "tesTaskId1", State = TesState.CANCELEDEnum }; - var secondTesTask = new TesTask { Id = "tesTaskId2", State = TesState.QUEUEDEnum }; - var thirdTesTask = new TesTask { Id = "tesTaskId3", State = TesState.INITIALIZINGEnum }; - var azureProxy = await ArrangeTest(new[] { firstTesTask, secondTesTask, thirdTesTask }); - - // Assert - azureProxy.Verify(i => i.ListOrphanedJobsToDeleteAsync(minJobAge, It.IsAny())); - azureProxy.Verify(i => i.DeleteBatchJobAsync("tesTaskId1", It.IsAny())); - azureProxy.Verify(i => i.DeleteBatchPoolIfExistsAsync("tesTaskId1", It.IsAny())); - azureProxy.VerifyNoOtherCalls(); - } - - [TestMethod] - public async Task DeleteOrphanedBatchJobs_DeletesJobs_TesStateUnknown() - { - // Arrange & Act - var firstTesTask = new TesTask { Id = "tesTaskId1", State = TesState.UNKNOWNEnum }; - var secondTesTask = new TesTask { Id = "tesTaskId2", State = TesState.PAUSEDEnum }; - var thirdTesTask = new TesTask { Id = "tesTaskId3", State = TesState.RUNNINGEnum }; - var azureProxy = await 
ArrangeTest(new[] { firstTesTask, secondTesTask, thirdTesTask }); - - // Assert - azureProxy.Verify(i => i.ListOrphanedJobsToDeleteAsync(minJobAge, It.IsAny())); - azureProxy.Verify(i => i.DeleteBatchJobAsync("tesTaskId1", It.IsAny())); - azureProxy.Verify(i => i.DeleteBatchPoolIfExistsAsync("tesTaskId1", It.IsAny())); - azureProxy.VerifyNoOtherCalls(); - } - - private static async Task> ArrangeTest(TesTask[] tasks) - { - void SetupRepository(Mock> mockRepo) - { - foreach (var item in tasks) - { - mockRepo.Setup(repo => repo.TryGetItemAsync(item.Id, It.IsAny(), It.IsAny>())) - .Callback>((id, _1, action) => - { - action(item); - }) - .ReturnsAsync(true); - } - } - - using var services = new TestServices.TestServiceProvider( - configuration: Enumerable.Repeat(("BatchScheduling:UseLegacyAutopools", true.ToString()), 1), - azureProxy: a => a.Setup(p => p.ListOrphanedJobsToDeleteAsync(minJobAge, It.IsAny())).ReturnsAsync(tasks.Select(i => i.Id + "-1")), - tesTaskRepository: SetupRepository); - - var deleteOrphanedBatchJobsHostedService = services.GetT(); - - await deleteOrphanedBatchJobsHostedService.StartAsync(new System.Threading.CancellationToken()); - - return services.AzureProxy; - } - } -} diff --git a/src/TesApi.Tests/DeleteOrphanedBatchPoolsHostedServiceTests.cs b/src/TesApi.Tests/DeleteOrphanedBatchPoolsHostedServiceTests.cs deleted file mode 100644 index 9b3cce92e..000000000 --- a/src/TesApi.Tests/DeleteOrphanedBatchPoolsHostedServiceTests.cs +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. 
- -using System; -using System.Collections.Generic; -using System.Linq; -using System.Threading; -using System.Threading.Tasks; -using Microsoft.VisualStudio.TestTools.UnitTesting; -using Moq; -using TesApi.Web; - -namespace TesApi.Tests -{ - [TestClass] - public class DeleteOrphanedBatchPoolsHostedServiceTests - { - [TestMethod] - public async Task DeleteOrphanedAutoPoolsServiceOnlyDeletesPoolsNotReferencedByJobs() - { - var activePoolIds = new List { "1", "2", "3" }; - var poolIdsReferencedByJobs = new List { "3", "4" }; - - using var services = new TestServices.TestServiceProvider( - configuration: Enumerable.Repeat(("BatchScheduling:UseLegacyAutopools", true.ToString()), 1), - azureProxy: a => - { - a.Setup(p => p.GetActivePoolIdsAsync(It.IsAny(), It.IsAny(), It.IsAny())).ReturnsAsync(activePoolIds); - a.Setup(p => p.GetPoolIdsReferencedByJobsAsync(It.IsAny())).ReturnsAsync(poolIdsReferencedByJobs); - }); - - var deleteCompletedBatchJobsHostedService = services.GetT(); - - await deleteCompletedBatchJobsHostedService.StartAsync(default); - - services.AzureProxy.Verify(i => i.GetActivePoolIdsAsync("TES_", TimeSpan.FromMinutes(30), It.IsAny())); - services.AzureProxy.Verify(i => i.GetPoolIdsReferencedByJobsAsync(It.IsAny())); - services.AzureProxy.Verify(i => i.DeleteBatchPoolAsync("1", It.IsAny()), Times.Once); - services.AzureProxy.Verify(i => i.DeleteBatchPoolAsync("2", It.IsAny()), Times.Once); - services.AzureProxy.VerifyNoOtherCalls(); - } - } -} diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index f4a5e61d1..96ada7d27 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -21,7 +21,6 @@ using Microsoft.Extensions.Options; using Microsoft.Rest; using Microsoft.WindowsAzure.Storage.Blob; -using Newtonsoft.Json; using Polly; using Polly.Retry; using Tes.Models; @@ -164,24 +163,6 @@ public static async Task GetAppInsightsConnectionStringAsync(string appI return null; } - /// - public async Task 
GetNextBatchJobIdAsync(string tesTaskId, CancellationToken cancellationToken) - { - var jobFilter = new ODATADetailLevel - { - FilterClause = $"startswith(id,'{tesTaskId}{BatchJobAttemptSeparator}')", - SelectClause = "id" - }; - - var lastAttemptNumber = await batchClient.JobOperations.ListJobs(jobFilter) - .ToAsyncEnumerable() - .Select(j => int.Parse(j.Id.Split(BatchJobAttemptSeparator)[1])) - .OrderBy(a => a) - .LastOrDefaultAsync(cancellationToken); - - return $"{tesTaskId}{BatchJobAttemptSeparator}{lastAttemptNumber + 1}"; - } - /// public IEnumerable GetBatchActiveNodeCountByVmSize() => batchClient.PoolOperations.ListPools() @@ -218,44 +199,6 @@ public int GetBatchActiveJobCount() return batchClient.JobOperations.ListJobs(activeJobsFilter).ToAsyncEnumerable().CountAsync(CancellationToken.None).AsTask().Result; } - /// - public async Task CreateAutoPoolModeBatchJobAsync(string jobId, CloudTask cloudTask, PoolInformation poolInformation, CancellationToken cancellationToken) - { - logger.LogInformation($"TES task: {cloudTask.Id} - creating Batch job"); - var job = batchClient.JobOperations.CreateJob(jobId, poolInformation); - job.OnAllTasksComplete = OnAllTasksComplete.TerminateJob; - await job.CommitAsync(cancellationToken: cancellationToken); - logger.LogInformation($"TES task: {cloudTask.Id} - Batch job committed successfully."); - await Task.Delay(TimeSpan.FromSeconds(2), cancellationToken); - - try - { - logger.LogInformation($"TES task: {cloudTask.Id} adding task to job."); - job = await batchRaceConditionJobNotFoundRetryPolicy.ExecuteAsync(ct => - batchClient.JobOperations.GetJobAsync(job.Id, cancellationToken: ct), - cancellationToken); - - await job.AddTaskAsync(cloudTask, cancellationToken: cancellationToken); - logger.LogInformation($"TES task: {cloudTask.Id} added task successfully."); - } - catch (Exception ex) when (ex is not OperationCanceledException) - { - var batchError = JsonConvert.SerializeObject((ex as 
BatchException)?.RequestInformation?.BatchError); - logger.LogError(ex, $"TES task: {cloudTask.Id} deleting {job.Id} because adding task to it failed. Batch error: {batchError}"); - - try - { - await batchClient.JobOperations.DeleteJobAsync(job.Id, cancellationToken: cancellationToken); - } - catch (Exception e) - { - logger.LogError(e, $"TES task: {cloudTask.Id} deleting {job.Id} failed."); - } - - throw; - } - } - /// public async Task CreateBatchJobAsync(PoolInformation poolInformation, CancellationToken cancellationToken) { @@ -295,11 +238,10 @@ public async Task DeleteBatchJobAsync(PoolInformation poolInformation, Cancellat /// [System.Diagnostics.CodeAnalysis.SuppressMessage("Performance", "CA1826:Do not use Enumerable methods on indexable collections", Justification = "FirstOrDefault() is straightforward, the alternative is less clear.")] - public async Task GetBatchJobAndTaskStateAsync(TesTask tesTask, bool usingAutoPools, CancellationToken cancellationToken) + public async Task GetBatchJobAndTaskStateAsync(TesTask tesTask, CancellationToken cancellationToken) { try { - var nodeAllocationFailed = false; string nodeErrorCode = null; IEnumerable nodeErrorDetails = null; var activeJobWithMissingAutoPool = false; @@ -317,80 +259,43 @@ public async Task GetBatchJobAndTaskStateAsync(TesTas SelectClause = "*" }; - if (usingAutoPools) + if (string.IsNullOrWhiteSpace(tesTask.PoolId)) { - // Normally, we will only find one job. If we find more, we always want the latest one. 
Thus, we use ListJobs() - var jobInfos = await batchClient.JobOperations.ListJobs(jobOrTaskFilter).ToAsyncEnumerable() - .Select(j => new { Job = j, AttemptNumber = int.Parse(j.Id.Split(BatchJobAttemptSeparator)[1]) }) - .ToListAsync(cancellationToken); - - if (!jobInfos.Any()) - { - return new AzureBatchJobAndTaskState { JobState = null }; - } - - if (jobInfos.Count(j => j.Job.State == JobState.Active) > 1) - { - return new AzureBatchJobAndTaskState { MoreThanOneActiveJobOrTaskFound = true }; - } + return new AzureBatchJobAndTaskState { JobState = null }; + } - var lastJobInfo = jobInfos.OrderBy(j => j.AttemptNumber).Last(); + try + { + job = await batchClient.JobOperations.GetJobAsync(tesTask.PoolId, cancellationToken: cancellationToken); + } + catch (BatchException ex) when (ex.InnerException is Microsoft.Azure.Batch.Protocol.Models.BatchErrorException e && e.Response.StatusCode == System.Net.HttpStatusCode.NotFound) + { + logger.LogError(ex, @"Failed to get job for TesTask {TesTask}", tesTask.Id); + return new AzureBatchJobAndTaskState { JobState = null }; + } - job = lastJobInfo.Job; - attemptNumber = lastJobInfo.AttemptNumber; + var taskInfos = await batchClient.JobOperations.ListTasks(tesTask.PoolId, jobOrTaskFilter).ToAsyncEnumerable() + .Select(t => new { Task = t, AttemptNumber = int.Parse(t.Id.Split(BatchJobAttemptSeparator)[1]) }) + .ToListAsync(cancellationToken); - try - { - batchTask = await batchClient.JobOperations.GetTaskAsync(job.Id, tesTask.Id, cancellationToken: cancellationToken); - } - catch (Exception ex) - { - logger.LogError(ex, @"Failed to get task for TesTask {TesTask}", tesTask.Id); - } + if (!taskInfos.Any()) + { + logger.LogError(@"Failed to get task for TesTask {TesTask}", tesTask.Id); } else { - if (string.IsNullOrWhiteSpace(tesTask.PoolId)) + if (taskInfos.Count(t => t.Task.State != TaskState.Completed) > 1) { - return new AzureBatchJobAndTaskState { JobState = null }; - } - - try - { - job = await 
batchClient.JobOperations.GetJobAsync(tesTask.PoolId, cancellationToken: cancellationToken); - } - catch (BatchException ex) when (ex.InnerException is Microsoft.Azure.Batch.Protocol.Models.BatchErrorException e && e.Response.StatusCode == System.Net.HttpStatusCode.NotFound) - { - logger.LogError(ex, @"Failed to get job for TesTask {TesTask}", tesTask.Id); - return new AzureBatchJobAndTaskState { JobState = null }; - } - - var taskInfos = await batchClient.JobOperations.ListTasks(tesTask.PoolId, jobOrTaskFilter).ToAsyncEnumerable() - .Select(t => new { Task = t, AttemptNumber = int.Parse(t.Id.Split(BatchJobAttemptSeparator)[1]) }) - .ToListAsync(cancellationToken); - - if (!taskInfos.Any()) - { - logger.LogError(@"Failed to get task for TesTask {TesTask}", tesTask.Id); + return new AzureBatchJobAndTaskState { MoreThanOneActiveJobOrTaskFound = true }; } - else - { - if (taskInfos.Count(t => t.Task.State != TaskState.Completed) > 1) - { - return new AzureBatchJobAndTaskState { MoreThanOneActiveJobOrTaskFound = true }; - } - var lastTaskInfo = taskInfos.OrderBy(t => t.AttemptNumber).Last(); - batchTask = lastTaskInfo.Task; - attemptNumber = lastTaskInfo.AttemptNumber; - } + var lastTaskInfo = taskInfos.OrderBy(t => t.AttemptNumber).Last(); + batchTask = lastTaskInfo.Task; + attemptNumber = lastTaskInfo.AttemptNumber; } poolId = job.ExecutionInformation?.PoolId; - Func computeNodePredicate = usingAutoPools - ? n => (n.RecentTasks?.Select(t => t.JobId) ?? Enumerable.Empty()).Contains(job.Id) - : n => (n.RecentTasks?.Select(t => t.TaskId) ?? 
Enumerable.Empty()).Contains(batchTask?.Id); var nodeId = string.Empty; @@ -414,9 +319,7 @@ public async Task GetBatchJobAndTaskStateAsync(TesTas if (pool is not null) { - nodeAllocationFailed = usingAutoPools && pool.ResizeErrors?.Count > 0; // When not using autopools, NodeAllocationFailed will be determined in BatchScheduler.GetBatchTaskStateAsync() - - var node = await pool.ListComputeNodes().ToAsyncEnumerable().FirstOrDefaultAsync(computeNodePredicate, cancellationToken); + var node = await pool.ListComputeNodes().ToAsyncEnumerable().FirstOrDefaultAsync(n => (n.RecentTasks?.Select(t => t.TaskId) ?? Enumerable.Empty()).Contains(batchTask?.Id), cancellationToken); if (node is not null) { @@ -447,7 +350,6 @@ public async Task GetBatchJobAndTaskStateAsync(TesTas MoreThanOneActiveJobOrTaskFound = false, ActiveJobWithMissingAutoPool = activeJobWithMissingAutoPool, AttemptNumber = attemptNumber, - NodeAllocationFailed = nodeAllocationFailed, NodeErrorCode = nodeErrorCode, NodeErrorDetails = nodeErrorDetails, NodeState = nodeState, diff --git a/src/TesApi.Web/BatchPoolService.cs b/src/TesApi.Web/BatchPoolService.cs index 05bbc627e..ef1b41772 100644 --- a/src/TesApi.Web/BatchPoolService.cs +++ b/src/TesApi.Web/BatchPoolService.cs @@ -20,7 +20,6 @@ public class BatchPoolService : BackgroundService { private readonly IBatchScheduler _batchScheduler; private readonly ILogger _logger; - private readonly bool _isDisabled; /// /// Interval between each call to . @@ -30,26 +29,13 @@ public class BatchPoolService : BackgroundService /// /// Default constructor /// - /// Configuration of /// /// /// - public BatchPoolService(IOptions batchSchedulingOptions, IBatchScheduler batchScheduler, ILogger logger) + public BatchPoolService(IBatchScheduler batchScheduler, ILogger logger) { _batchScheduler = batchScheduler ?? throw new ArgumentNullException(nameof(batchScheduler)); _logger = logger ?? 
throw new ArgumentNullException(nameof(logger)); - _isDisabled = batchSchedulingOptions.Value.UseLegacyAutopools; - } - - /// - public override Task StartAsync(CancellationToken cancellationToken) - { - if (_isDisabled) - { - return Task.CompletedTask; - } - - return base.StartAsync(cancellationToken); } /// diff --git a/src/TesApi.Web/BatchScheduler.BatchPools.cs b/src/TesApi.Web/BatchScheduler.BatchPools.cs index f399e2b62..1bd1292bd 100644 --- a/src/TesApi.Web/BatchScheduler.BatchPools.cs +++ b/src/TesApi.Web/BatchScheduler.BatchPools.cs @@ -114,11 +114,6 @@ internal bool IsPoolAvailable(string key) internal async Task GetOrAddPoolAsync(string key, bool isPreemptable, ModelPoolFactory modelPoolFactory, CancellationToken cancellationToken) { - if (enableBatchAutopool) - { - return default; - } - ArgumentNullException.ThrowIfNull(modelPoolFactory); var keyLength = key?.Length ?? 0; if (keyLength > PoolKeyLength || keyLength < 1) @@ -184,22 +179,19 @@ public async ValueTask FlushPoolsAsync(IEnumerable assignedPools, Cancel try { - if (!this.enableBatchAutopool) + var pools = (await batchPools.GetAllPools() + .ToAsyncEnumerable() + .WhereAwait(async p => await p.CanBeDeleted(cancellationToken)) + .ToListAsync(cancellationToken)) + .Where(p => !assignedPools.Contains(p.Pool.PoolId)) + .OrderBy(p => p.GetAllocationStateTransitionTime(cancellationToken)) + .Take(neededPools.Count) + .ToList(); + + foreach (var pool in pools) { - var pools = (await batchPools.GetAllPools() - .ToAsyncEnumerable() - .WhereAwait(async p => await p.CanBeDeleted(cancellationToken)) - .ToListAsync(cancellationToken)) - .Where(p => !assignedPools.Contains(p.Pool.PoolId)) - .OrderBy(p => p.GetAllocationStateTransitionTime(cancellationToken)) - .Take(neededPools.Count) - .ToList(); - - foreach (var pool in pools) - { - await DeletePoolAsync(pool, cancellationToken); - _ = RemovePoolFromList(pool); - } + await DeletePoolAsync(pool, cancellationToken); + _ = RemovePoolFromList(pool); } } finally 
diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 855621d55..09d45c344 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -71,7 +71,6 @@ public partial class BatchScheduler : IBatchScheduler private readonly bool usePreemptibleVmsOnly; private readonly string batchNodesSubnetId; private readonly bool disableBatchNodesPublicIpAddress; - private readonly bool enableBatchAutopool; private readonly TimeSpan poolLifetime; private readonly BatchNodeInfo gen2BatchNodeInfo; private readonly BatchNodeInfo gen1BatchNodeInfo; @@ -147,8 +146,7 @@ public BatchScheduler( this.cromwellDrsLocalizerImageName = marthaOptions.Value.CromwellDrsLocalizer; if (string.IsNullOrWhiteSpace(this.cromwellDrsLocalizerImageName)) { this.cromwellDrsLocalizerImageName = Options.MarthaOptions.DefaultCromwellDrsLocalizer; } this.disableBatchNodesPublicIpAddress = batchNodesOptions.Value.DisablePublicIpAddress; - this.enableBatchAutopool = batchSchedulingOptions.Value.UseLegacyAutopools; - this.poolLifetime = this.enableBatchAutopool ? TimeSpan.Zero : TimeSpan.FromDays(batchSchedulingOptions.Value.PoolRotationForcedDays == 0 ? Options.BatchSchedulingOptions.DefaultPoolRotationForcedDays : batchSchedulingOptions.Value.PoolRotationForcedDays); + this.poolLifetime = TimeSpan.FromDays(batchSchedulingOptions.Value.PoolRotationForcedDays == 0 ? 
Options.BatchSchedulingOptions.DefaultPoolRotationForcedDays : batchSchedulingOptions.Value.PoolRotationForcedDays); this.defaultStorageAccountName = storageOptions.Value.DefaultAccountName; this.marthaUrl = marthaOptions.Value.Url; this.marthaKeyVaultName = marthaOptions.Value.KeyVaultName; @@ -157,14 +155,11 @@ public BatchScheduler( this.globalManagedIdentity = batchNodesOptions.Value.GlobalManagedIdentity; this.allowedVmSizesService = allowedVmSizesService; - if (!this.enableBatchAutopool) - { - _batchPoolFactory = poolFactory; - batchPrefix = batchSchedulingOptions.Value.Prefix; - logger.LogInformation("BatchPrefix: {BatchPrefix}", batchPrefix); - taskRunScriptContent = File.ReadAllLines(Path.Combine(AppContext.BaseDirectory, "scripts/task-run.sh")); - taskCleanupScriptContent = File.ReadAllLines(Path.Combine(AppContext.BaseDirectory, "scripts/clean-executor.sh")); - } + _batchPoolFactory = poolFactory; + batchPrefix = batchSchedulingOptions.Value.Prefix; + logger.LogInformation("BatchPrefix: {BatchPrefix}", batchPrefix); + taskRunScriptContent = File.ReadAllLines(Path.Combine(AppContext.BaseDirectory, "scripts/task-run.sh")); + taskCleanupScriptContent = File.ReadAllLines(Path.Combine(AppContext.BaseDirectory, "scripts/clean-executor.sh")); this.gen2BatchNodeInfo = new BatchNodeInfo { @@ -242,7 +237,7 @@ async Task SetTaskSystemError(TesTask tesTask, CombinedBatchTaskInfo batchInfo, async Task DeleteBatchJobAndSetTaskStateAsync(TesTask tesTask, TesState newTaskState, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { - await DeleteBatchJobOrTaskAsync(tesTask, batchInfo.Pool, cancellationToken); + await azureProxy.DeleteBatchTaskAsync(tesTask.Id, batchInfo.Pool, cancellationToken); await azureProxy.DeleteBatchPoolIfExistsAsync(tesTask.Id, cancellationToken); SetTaskStateAndLog(tesTask, newTaskState, batchInfo); } @@ -263,29 +258,23 @@ Task AddSystemLogAndDeleteBatchJobAndSetTaskExecutorErrorAsync(TesTask tesTask, async Task 
CancelTaskAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { - await DeleteBatchJobOrTaskAsync(tesTask, batchInfo.Pool, cancellationToken); + await azureProxy.DeleteBatchTaskAsync(tesTask.Id, batchInfo.Pool, cancellationToken); await azureProxy.DeleteBatchPoolIfExistsAsync(tesTask.Id, cancellationToken); tesTask.IsCancelRequested = false; } Task HandlePreemptedNodeAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { - if (enableBatchAutopool) - { - return DeleteBatchJobAndRequeueTaskAsync(tesTask, batchInfo, cancellationToken); - } - else - { - logger.LogInformation("The TesTask {TesTask}'s node was preempted. It will be automatically rescheduled.", tesTask.Id); - return Task.FromResult(false); - } + logger.LogInformation("The TesTask {TesTask}'s node was preempted. It will be automatically rescheduled.", tesTask.Id); + tesTask.State = TesState.INITIALIZINGEnum; + return Task.FromResult(false); } tesTaskStateTransitions = new List() { new TesTaskStateTransition(tesTaskCancellationRequested, batchTaskState: null, alternateSystemLogItem: null, CancelTaskAsync), new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.JobNotFound, alternateSystemLogItem: null, (tesTask, _, ct) => AddBatchTaskAsync(tesTask, ct)), - new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.MissingBatchTask, alternateSystemLogItem: null, (tesTask, batchInfo, ct) => enableBatchAutopool ? 
DeleteBatchJobAndRequeueTaskAsync(tesTask, batchInfo, ct) : AddBatchTaskAsync(tesTask, ct)), + new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.MissingBatchTask, alternateSystemLogItem: null, (tesTask, batchInfo, ct) => AddBatchTaskAsync(tesTask, ct)), new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.Initializing, alternateSystemLogItem: null, (tesTask, _) => tesTask.State = TesState.INITIALIZINGEnum), new TesTaskStateTransition(tesTaskIsQueuedOrInitializing, BatchTaskState.NodeAllocationFailed, alternateSystemLogItem: null, DeleteBatchJobAndRequeueTaskAsync), new TesTaskStateTransition(tesTaskIsQueuedOrInitializing, BatchTaskState.Running, alternateSystemLogItem: null, (tesTask, _) => tesTask.State = TesState.RUNNINGEnum), @@ -301,9 +290,6 @@ Task HandlePreemptedNodeAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, }; } - private Task DeleteBatchJobOrTaskAsync(TesTask tesTask, PoolInformation poolInformation, CancellationToken cancellationToken) - => enableBatchAutopool ? azureProxy.DeleteBatchJobAsync(tesTask.Id, cancellationToken) : poolInformation is null || poolInformation.PoolId is null ? 
WarnWhenUnableToFindPoolToDeleteTask(tesTask) : azureProxy.DeleteBatchTaskAsync(tesTask.Id, poolInformation, cancellationToken); - private Task WarnWhenUnableToFindPoolToDeleteTask(TesTask tesTask) { logger.LogWarning("Unable to delete batch task for task {TesTask} because of missing pool/job information.", tesTask.Id); @@ -317,7 +303,7 @@ private async Task DeleteBatchTaskAndOrJobAndOrPoolIfExists(IAzureProxy azurePro try { - await DeleteBatchJobOrTaskAsync(tesTask, batchInfo.Pool, cancellationToken); + await azureProxy.DeleteBatchTaskAsync(tesTask.Id, batchInfo.Pool, cancellationToken); } catch (Exception exc) { @@ -325,19 +311,6 @@ private async Task DeleteBatchTaskAndOrJobAndOrPoolIfExists(IAzureProxy azurePro batchDeletionExceptions.Add(exc); } - if (enableBatchAutopool) - { - try - { - await azureProxy.DeleteBatchPoolIfExistsAsync(tesTask.Id, cancellationToken); - } - catch (Exception exc) - { - logger.LogError(exc, $"Exception deleting batch pool with tesTask.Id: {tesTask?.Id}"); - batchDeletionExceptions.Add(exc); - } - } - if (batchDeletionExceptions.Any()) { throw new AggregateException(batchDeletionExceptions); @@ -370,19 +343,16 @@ public IAsyncEnumerable GetCloudPools(CancellationToken cancellationT /// public async Task LoadExistingPoolsAsync(CancellationToken cancellationToken) { - if (!enableBatchAutopool) + await foreach (var cloudPool in GetCloudPools(cancellationToken).WithCancellation(cancellationToken)) { - await foreach (var cloudPool in GetCloudPools(cancellationToken).WithCancellation(cancellationToken)) + try { - try - { - var batchPool = _batchPoolFactory.CreateNew(); - await batchPool.AssignPoolAsync(cloudPool, cancellationToken); - } - catch (Exception exc) - { - logger.LogError(exc, "When retrieving previously created batch pools and jobs, there were one or more failures when trying to access batch pool {PoolId} or its associated job.", cloudPool.Id); - } + var batchPool = _batchPoolFactory.CreateNew(); + await 
batchPool.AssignPoolAsync(cloudPool, cancellationToken); + } + catch (Exception exc) + { + logger.LogError(exc, "When retrieving previously created batch pools and jobs, there were one or more failures when trying to access batch pool {PoolId} or its associated job.", cloudPool.Id); } } } @@ -530,8 +500,8 @@ private async Task AddBatchTaskAsync(TesTask tesTask, CancellationToken cancella var virtualMachineInfo = await GetVmSizeAsync(tesTask, cancellationToken); var containerMetadata = await GetContainerConfigurationIfNeededAsync(tesTask, cancellationToken); - (poolKey, var displayName) = enableBatchAutopool ? default : GetPoolKey(tesTask, virtualMachineInfo, containerMetadata.ContainerConfiguration, cancellationToken); - await quotaVerifier.CheckBatchAccountQuotasAsync(virtualMachineInfo, needPoolOrJobQuotaCheck: enableBatchAutopool || !IsPoolAvailable(poolKey), needCoresUtilizationQuotaCheck: enableBatchAutopool, cancellationToken: cancellationToken); + (poolKey, var displayName) = GetPoolKey(tesTask, virtualMachineInfo, containerMetadata.ContainerConfiguration, cancellationToken); + await quotaVerifier.CheckBatchAccountQuotasAsync(virtualMachineInfo, needPoolOrJobQuotaCheck: !IsPoolAvailable(poolKey), cancellationToken: cancellationToken); var tesTaskLog = tesTask.AddTesTaskLog(); tesTaskLog.VirtualMachineInfo = virtualMachineInfo; @@ -548,58 +518,30 @@ private async Task AddBatchTaskAsync(TesTask tesTask, CancellationToken cancella } var useGen2 = virtualMachineInfo.HyperVGenerations?.Contains("V2"); - string jobOrTaskId = default; - if (enableBatchAutopool) - { - jobOrTaskId = await azureProxy.GetNextBatchJobIdAsync(tesTask.Id, cancellationToken); - poolInformation = await CreateAutoPoolModePoolInformation( - poolSpecification: await GetPoolSpecification( - vmSize: virtualMachineInfo.VmSize, - autoscaled: false, - preemptable: virtualMachineInfo.LowPriority, - nodeInfo: useGen2.GetValueOrDefault() ? 
gen2BatchNodeInfo : gen1BatchNodeInfo, - containerConfiguration: containerMetadata.ContainerConfiguration, - encryptionAtHostSupported: virtualMachineInfo.EncryptionAtHostSupported, - cancellationToken: cancellationToken), - tesTaskId: tesTask.Id, - jobId: jobOrTaskId, - cancellationToken: cancellationToken, - identityResourceIds: identities); - } - else - { - poolInformation = (await GetOrAddPoolAsync( - key: poolKey, - isPreemptable: virtualMachineInfo.LowPriority, - modelPoolFactory: async (id, ct) => ConvertPoolSpecificationToModelsPool( - name: id, - displayName: displayName, - poolIdentity: GetBatchPoolIdentity(identities.ToArray()), - pool: await GetPoolSpecification( - vmSize: virtualMachineInfo.VmSize, - autoscaled: true, - preemptable: virtualMachineInfo.LowPriority, - nodeInfo: useGen2.GetValueOrDefault() ? gen2BatchNodeInfo : gen1BatchNodeInfo, - containerConfiguration: containerMetadata.ContainerConfiguration, - encryptionAtHostSupported: virtualMachineInfo.EncryptionAtHostSupported, - cancellationToken: ct)), - cancellationToken: cancellationToken) - ).Pool; - jobOrTaskId = $"{tesTask.Id}-{tesTask.Logs.Count}"; - } - + poolInformation = (await GetOrAddPoolAsync( + key: poolKey, + isPreemptable: virtualMachineInfo.LowPriority, + modelPoolFactory: async (id, ct) => ConvertPoolSpecificationToModelsPool( + name: id, + displayName: displayName, + poolIdentity: GetBatchPoolIdentity(identities.ToArray()), + pool: await GetPoolSpecification( + vmSize: virtualMachineInfo.VmSize, + autoscaled: true, + preemptable: virtualMachineInfo.LowPriority, + nodeInfo: useGen2.GetValueOrDefault() ? 
gen2BatchNodeInfo : gen1BatchNodeInfo, + containerConfiguration: containerMetadata.ContainerConfiguration, + encryptionAtHostSupported: virtualMachineInfo.EncryptionAtHostSupported, + cancellationToken: ct)), + cancellationToken: cancellationToken) + ).Pool; + + var jobOrTaskId = $"{tesTask.Id}-{tesTask.Logs.Count}"; tesTask.PoolId = poolInformation.PoolId; - var cloudTask = await ConvertTesTaskToBatchTaskAsync(enableBatchAutopool ? tesTask.Id : jobOrTaskId, tesTask, containerMetadata.IsPublic, cancellationToken); - logger.LogInformation($"Creating batch task for TES task {tesTask.Id}. Using VM size {virtualMachineInfo.VmSize}."); + var cloudTask = await ConvertTesTaskToBatchTaskAsync(jobOrTaskId, tesTask, containerMetadata.IsPublic, cancellationToken); - if (enableBatchAutopool) - { - await azureProxy.CreateAutoPoolModeBatchJobAsync(jobOrTaskId, cloudTask, poolInformation, cancellationToken); - } - else - { - await azureProxy.AddBatchTaskAsync(tesTask.Id, cloudTask, poolInformation, cancellationToken); - } + logger.LogInformation($"Creating batch task for TES task {tesTask.Id}. 
Using VM size {virtualMachineInfo.VmSize}."); + await azureProxy.AddBatchTaskAsync(tesTask.Id, cloudTask, poolInformation, cancellationToken); tesTaskLog.StartTime = DateTimeOffset.UtcNow; tesTask.State = TesState.INITIALIZINGEnum; @@ -616,13 +558,6 @@ private async Task AddBatchTaskAsync(TesTask tesTask, CancellationToken cancella { HandleException(exception); } - finally - { - if (enableBatchAutopool && poolInformation?.AutoPoolSpecification is not null) - { - await azureProxy.DeleteBatchPoolIfExistsAsync(tesTask.Id, cancellationToken); - } - } void HandleException(Exception exception) { @@ -718,12 +653,7 @@ void HandleException(Exception exception) [System.Diagnostics.CodeAnalysis.SuppressMessage("Performance", "CA1826:Do not use Enumerable methods on indexable collections", Justification = "FirstOrDefault() is straightforward, the alternative is less clear.")] private async ValueTask GetBatchTaskStateAsync(TesTask tesTask, CancellationToken cancellationToken) { - var azureBatchJobAndTaskState = await azureProxy.GetBatchJobAndTaskStateAsync(tesTask, enableBatchAutopool, cancellationToken); - - if (enableBatchAutopool) - { - tesTask.PoolId ??= azureBatchJobAndTaskState.Pool?.PoolId; - } + var azureBatchJobAndTaskState = await azureProxy.GetBatchJobAndTaskStateAsync(tesTask, cancellationToken); if (azureBatchJobAndTaskState.Pool?.PoolId is null) { @@ -771,29 +701,22 @@ static IEnumerable ConvertNodeErrorsToSystemLogItems(AzureBatchJobAndTas // Because a ComputeTask is not assigned to the compute node while the StartTask is running, IAzureProxy.GetBatchJobAndTaskStateAsync() does not see start task failures. Deal with that here. 
if (azureBatchJobAndTaskState.NodeState is null && azureBatchJobAndTaskState.JobState == JobState.Active && azureBatchJobAndTaskState.TaskState == TaskState.Active && !string.IsNullOrWhiteSpace(azureBatchJobAndTaskState.Pool?.PoolId)) { - if (enableBatchAutopool) - { - _ = ProcessStartTaskFailure((await azureProxy.ListComputeNodesAsync(azureBatchJobAndTaskState.Pool.PoolId, new ODATADetailLevel { FilterClause = "state eq 'starttaskfailed'", SelectClause = "id,startTaskInfo" }).FirstOrDefaultAsync(cancellationToken: cancellationToken))?.StartTaskInformation?.FailureInformation); - } - else + /* + * Priority order for assigning errors to TesTasks in shared-pool mode: + * 1. Node error found in GetBatchJobAndTaskStateAsync() + * 2. StartTask failure + * 3. NodeAllocation failure + */ + if (TryGetPool(azureBatchJobAndTaskState.Pool.PoolId, out var pool)) { - /* - * Priority order for assigning errors to TesTasks in shared-pool mode: - * 1. Node error found in GetBatchJobAndTaskStateAsync() - * 2. StartTask failure - * 3. NodeAllocation failure - */ - if (TryGetPool(azureBatchJobAndTaskState.Pool.PoolId, out var pool)) + if (!string.IsNullOrWhiteSpace(azureBatchJobAndTaskState.NodeErrorCode) || !ProcessStartTaskFailure(pool.PopNextStartTaskFailure())) { - if (!string.IsNullOrWhiteSpace(azureBatchJobAndTaskState.NodeErrorCode) || !ProcessStartTaskFailure(pool.PopNextStartTaskFailure())) + var resizeError = pool.PopNextResizeError(); + if (resizeError is not null) { - var resizeError = pool.PopNextResizeError(); - if (resizeError is not null) - { - azureBatchJobAndTaskState.NodeAllocationFailed = true; - azureBatchJobAndTaskState.NodeErrorCode = resizeError.Code; - azureBatchJobAndTaskState.NodeErrorDetails = Enumerable.Repeat(resizeError.Message, string.IsNullOrWhiteSpace(resizeError.Message) ? 1 : 0).Concat(resizeError.Values?.Select(d => d.Value) ?? 
Enumerable.Empty()); - } + azureBatchJobAndTaskState.NodeAllocationFailed = true; + azureBatchJobAndTaskState.NodeErrorCode = resizeError.Code; + azureBatchJobAndTaskState.NodeErrorDetails = Enumerable.Repeat(resizeError.Message, string.IsNullOrWhiteSpace(resizeError.Message) ? 1 : 0).Concat(resizeError.Values?.Select(d => d.Value) ?? Enumerable.Empty()); } } } @@ -1180,9 +1103,7 @@ private async Task ConvertTesTaskToBatchTaskAsync(string taskId, TesT await storageAccessProvider.UploadBlobAsync(new Uri(nodeBatchScriptSasUrl), sb.ToString(), cancellationToken); - var batchRunCommand = enableBatchAutopool - ? $"/bin/bash -c {CreateWgetDownloadCommand(nodeBatchScriptSasUrl, $"AZ_BATCH_TASK_WORKING_DIR/{BatchScriptFileName}", setExecutable: true)} && AZ_BATCH_TASK_WORKING_DIR/{BatchScriptFileName}" - : $"/bin/bash -c \"{MungeBatchTaskCommandLine()}\""; + var batchRunCommand = $"/bin/bash -c \"{MungeBatchTaskCommandLine()}\""; // Replace any URL query strings with the word REMOVED const string pattern = @"(https?:\/\/[^?\s]+)\?[^?\s]*"; @@ -1367,9 +1288,7 @@ private async Task StartTaskIfNeeded(VirtualMachineConfiguration mach var globalStartTaskConfigured = !string.IsNullOrWhiteSpace(globalStartTaskPath); var startTaskSasUrl = globalStartTaskConfigured - ? enableBatchAutopool - ? await storageAccessProvider.MapLocalPathToSasUrlAsync(globalStartTaskPath, cancellationToken) - : await storageAccessProvider.MapLocalPathToSasUrlAsync(globalStartTaskPath, cancellationToken, sasTokenDuration: BatchPoolService.RunInterval.Multiply(2).Add(poolLifetime).Add(TimeSpan.FromMinutes(15))) + ? 
await storageAccessProvider.MapLocalPathToSasUrlAsync(globalStartTaskPath, cancellationToken, sasTokenDuration: BatchPoolService.RunInterval.Multiply(2).Add(poolLifetime).Add(TimeSpan.FromMinutes(15))) : default; if (startTaskSasUrl is not null) @@ -1513,59 +1432,6 @@ var s when s.StartsWith("batch.node.centos ") => "sudo yum install epel-release }, (executorImageIsPublic, dockerInDockerIsPublic, cromwellDrsIsPublic)); } - /// - /// Constructs either an or a new pool in the batch account ready for a job to be attached. - /// - /// - /// - /// - /// A for controlling the lifetime of the asynchronous operation. - /// - /// If is provided, must also be provided.
This method does not support autscaled pools.
- /// An - private async Task CreateAutoPoolModePoolInformation(PoolSpecification poolSpecification, string tesTaskId, string jobId, CancellationToken cancellationToken, IEnumerable identityResourceIds = null) - { - var identities = identityResourceIds?.ToArray() ?? Array.Empty(); - var isAutoPool = !identities.Any(); - - if (isAutoPool) - { - logger.LogInformation($"TES task: {tesTaskId} creating Auto Pool using VM size {poolSpecification.VirtualMachineSize}"); - } - else - { - logger.LogInformation($"TES task: {tesTaskId} creating Manual Batch Pool using VM size {poolSpecification.VirtualMachineSize}"); - } - - // By default, the pool will have the same name/ID as the job if the identity is provided, otherwise we return an actual autopool. - return isAutoPool - ? new() - { - AutoPoolSpecification = new() - { - AutoPoolIdPrefix = "TES", - PoolLifetimeOption = PoolLifetimeOption.Job, - PoolSpecification = poolSpecification, - KeepAlive = false - } - } - : await azureProxy.CreateBatchPoolAsync( - ConvertPoolSpecificationToModelsPool( - $"TES_{jobId}", - jobId, - GetBatchPoolIdentity(identities), - poolSpecification), - IsPreemptable(), cancellationToken); - - bool IsPreemptable() - => true switch - { - _ when poolSpecification.TargetDedicatedComputeNodes > 0 => false, - _ when poolSpecification.TargetLowPriorityComputeNodes > 0 => true, - _ => throw new ArgumentException("Unable to determine if pool will host a low priority compute node.", nameof(poolSpecification)), - }; - } - /// /// Generate the BatchPoolIdentity object /// diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index 17b7fbc6f..d228de624 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -65,9 +65,6 @@ public CachingWithRetriesAzureProxy(IAzureProxy azureProxy, IOptions - public Task CreateAutoPoolModeBatchJobAsync(string jobId, CloudTask cloudTask, PoolInformation 
poolInformation, CancellationToken cancellationToken) => azureProxy.CreateAutoPoolModeBatchJobAsync(jobId, cloudTask, poolInformation, cancellationToken); - /// public async Task CreateBatchJobAsync(PoolInformation poolInformation, CancellationToken cancellationToken) { @@ -138,10 +135,7 @@ public async Task DeleteBatchComputeNodesAsync(string poolId, IEnumerable cachingRetryHandler.ExecuteWithRetry(() => azureProxy.GetBatchActivePoolCount()); /// - public Task GetBatchJobAndTaskStateAsync(Tes.Models.TesTask tesTask, bool usingAutoPools, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBatchJobAndTaskStateAsync(tesTask, usingAutoPools, ct), cancellationToken); - - /// - public Task GetNextBatchJobIdAsync(string tesTaskId, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetNextBatchJobIdAsync(tesTaskId, ct), cancellationToken); + public Task GetBatchJobAndTaskStateAsync(Tes.Models.TesTask tesTask, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBatchJobAndTaskStateAsync(tesTask, ct), cancellationToken); /// public Task> GetPoolIdsReferencedByJobsAsync(CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(azureProxy.GetPoolIdsReferencedByJobsAsync, cancellationToken); diff --git a/src/TesApi.Web/DeleteCompletedBatchJobsHostedService.cs b/src/TesApi.Web/DeleteCompletedBatchJobsHostedService.cs deleted file mode 100644 index 876844c24..000000000 --- a/src/TesApi.Web/DeleteCompletedBatchJobsHostedService.cs +++ /dev/null @@ -1,132 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. 
- -using System; -using System.Threading; -using System.Threading.Tasks; -using Microsoft.Extensions.Hosting; -using Microsoft.Extensions.Logging; -using Microsoft.Extensions.Options; -using Tes.Models; -using Tes.Repository; - -namespace TesApi.Web -{ - /// - /// Background service to delete Batch jobs older than seven days for completed tasks - /// - public class DeleteCompletedBatchJobsHostedService : BackgroundService - { - private static readonly TimeSpan oldestJobAge = TimeSpan.FromDays(7); - private readonly IRepository repository; - private readonly IAzureProxy azureProxy; - private readonly ILogger logger; - private readonly bool isDisabled; - - /// - /// Default constructor - /// - /// Configuration of - /// Azure Proxy - /// The main TES task database repository - /// The logger instance - public DeleteCompletedBatchJobsHostedService(IOptions batchSchedulingOptions, IAzureProxy azureProxy, IRepository repository, ILogger logger) - { - this.repository = repository; - this.azureProxy = azureProxy; - this.logger = logger; - this.isDisabled = batchSchedulingOptions.Value.DisableJobCleanup || !batchSchedulingOptions.Value.UseLegacyAutopools; - } - - /// - public override Task StartAsync(CancellationToken cancellationToken) - { - if (isDisabled) - { - return Task.CompletedTask; - } - - return base.StartAsync(cancellationToken); - } - - /// - public override Task StopAsync(CancellationToken cancellationToken) - { - logger.LogInformation("Batch Job cleanup stopping..."); - return base.StopAsync(cancellationToken); - } - - /// - /// The job clean up service that checks for old jobs on the Batch account that are safe to delete - /// - /// Triggered when Microsoft.Extensions.Hosting.IHostedService.StopAsync(System.Threading.CancellationToken) is called. - /// A System.Threading.Tasks.Task that represents the long running operations. 
- protected override async Task ExecuteAsync(CancellationToken cancellationToken) - { - var runInterval = TimeSpan.FromDays(1); - logger.LogInformation("Batch Job cleanup started."); - - while (!cancellationToken.IsCancellationRequested) - { - try - { - await DeleteOldBatchJobs(cancellationToken); - } - catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) - { - break; - } - catch (Exception exc) - { - logger.LogError(exc, exc.Message); - } - - try - { - await Task.Delay(runInterval, cancellationToken); - } - catch (TaskCanceledException) - { - break; - } - } - - logger.LogInformation("Batch Job cleanup gracefully stopped."); - } - - private async Task DeleteOldBatchJobs(CancellationToken cancellationToken) - { - var jobsToDelete = await azureProxy.ListOldJobsToDeleteAsync(oldestJobAge, cancellationToken); - - foreach (var jobId in jobsToDelete) - { - var tesTaskId = jobId.Split(new[] { '-' })[0]; - logger.LogInformation($"TES task: {tesTaskId} deleting Batch Job ID: {jobId}"); - - TesTask tesTask = null; - - if (await repository.TryGetItemAsync(tesTaskId, cancellationToken, item => tesTask = item)) - { - if (tesTask.State == TesState.COMPLETEEnum || - tesTask.State == TesState.EXECUTORERROREnum || - tesTask.State == TesState.SYSTEMERROREnum || - tesTask.State == TesState.CANCELEDEnum || - tesTask.State == TesState.UNKNOWNEnum) - { - await azureProxy.DeleteBatchJobAsync(tesTaskId, cancellationToken); - - try - { - await azureProxy.DeleteBatchPoolIfExistsAsync(tesTaskId, cancellationToken); - } - catch (Exception exc) - { - logger.LogError(exc, $"TES task: {tesTaskId} Exception in DeleteOldBatchJobs when attempting to delete the manual batch pool"); - // Do not rethrow - } - } - } - } - } - } -} diff --git a/src/TesApi.Web/DeleteOrphanedAutoPoolsHostedService.cs b/src/TesApi.Web/DeleteOrphanedAutoPoolsHostedService.cs deleted file mode 100644 index 8fbf8ad98..000000000 --- a/src/TesApi.Web/DeleteOrphanedAutoPoolsHostedService.cs +++ 
/dev/null @@ -1,102 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. - -using System; -using System.Linq; -using System.Threading; -using System.Threading.Tasks; -using Microsoft.Extensions.Hosting; -using Microsoft.Extensions.Logging; -using Microsoft.Extensions.Options; - -namespace TesApi.Web -{ - /// - /// Background service to delete Batch auto pools that do not have the corresponding Batch job. - /// This happens in rare cases when auto pool job is rapidly created and deleted but the pool continues with creation, - /// resulting in an active pool with a single node that is not attached to a job. - /// - public class DeleteOrphanedAutoPoolsHostedService : BackgroundService - { - private static readonly TimeSpan runInterval = TimeSpan.FromHours(1); - private static readonly TimeSpan minPoolAge = TimeSpan.FromMinutes(30); - private static readonly string autoPoolIdPrefix = "TES_"; - private readonly IAzureProxy azureProxy; - private readonly ILogger logger; - private readonly bool isDisabled; - - /// - /// Default constructor - /// - /// Configuration of - /// Azure Proxy - /// The logger instance - public DeleteOrphanedAutoPoolsHostedService(IOptions batchSchedulingOptions, IAzureProxy azureProxy, ILogger logger) - { - this.azureProxy = azureProxy; - this.logger = logger; - this.isDisabled = !batchSchedulingOptions.Value.UseLegacyAutopools; - } - - /// - public override Task StartAsync(CancellationToken cancellationToken) - { - if (isDisabled) - { - return Task.CompletedTask; - } - - return base.StartAsync(cancellationToken); - } - - /// - protected override async Task ExecuteAsync(CancellationToken cancellationToken) - { - logger.LogInformation("Orphaned pool cleanup service started"); - - while (!cancellationToken.IsCancellationRequested) - { - try - { - await DeleteOrphanedAutoPoolsAsync(cancellationToken); - } - catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) - { - break; - } - catch 
(Exception ex) when (!(ex is OperationCanceledException && cancellationToken.IsCancellationRequested)) - { - logger.LogError(ex, ex.Message); - } - - try - { - await Task.Delay(runInterval, cancellationToken); - } - catch (TaskCanceledException) - { - break; - } - } - } - - private async Task DeleteOrphanedAutoPoolsAsync(CancellationToken cancellationToken) - { - var activePoolIds = (await azureProxy.GetActivePoolIdsAsync(autoPoolIdPrefix, minPoolAge, cancellationToken)).ToList(); - - if (activePoolIds.Any()) - { - var poolIdsReferencedByJobs = (await azureProxy.GetPoolIdsReferencedByJobsAsync(cancellationToken)).ToList(); - - var orphanedPoolIds = activePoolIds.Except(poolIdsReferencedByJobs); - - foreach (var orphanedPoolId in orphanedPoolIds) - { - cancellationToken.ThrowIfCancellationRequested(); - logger.LogInformation($"Deleting orphanded pool {orphanedPoolId}, since no jobs reference it."); - await azureProxy.DeleteBatchPoolAsync(orphanedPoolId, cancellationToken); - } - } - } - } -} diff --git a/src/TesApi.Web/DeleteOrphanedBatchJobsHostedService.cs b/src/TesApi.Web/DeleteOrphanedBatchJobsHostedService.cs deleted file mode 100644 index c0b802aa2..000000000 --- a/src/TesApi.Web/DeleteOrphanedBatchJobsHostedService.cs +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. - -using System; -using System.Threading; -using System.Threading.Tasks; -using Microsoft.Extensions.Hosting; -using Microsoft.Extensions.Logging; -using Microsoft.Extensions.Options; -using Tes.Models; -using Tes.Repository; - -namespace TesApi.Web -{ - /// - /// Background service to delete active Batch jobs that have no tasks as the result of job creation exception. 
- /// - public class DeleteOrphanedBatchJobsHostedService : BackgroundService - { - private static readonly TimeSpan runInterval = TimeSpan.FromHours(1); - private static readonly TimeSpan minJobAge = TimeSpan.FromHours(1); - - private readonly IRepository repository; - private readonly IAzureProxy azureProxy; - private readonly ILogger logger; - private readonly bool isDisabled; - - /// - /// Default constructor - /// - /// Configuration of - /// Azure Proxy - /// The main TES task database repository - /// The logger instance - public DeleteOrphanedBatchJobsHostedService(IOptions batchSchedulingOptions, IAzureProxy azureProxy, IRepository repository, ILogger logger) - { - this.repository = repository; - this.azureProxy = azureProxy; - this.logger = logger; - this.isDisabled = !batchSchedulingOptions.Value.UseLegacyAutopools; - } - - /// - public override Task StartAsync(CancellationToken cancellationToken) - { - if (isDisabled) - { - return Task.CompletedTask; - } - - return base.StartAsync(cancellationToken); - } - - /// - protected override async Task ExecuteAsync(CancellationToken cancellationToken) - { - logger.LogInformation("Orphaned job cleanup service started"); - - while (!cancellationToken.IsCancellationRequested) - { - try - { - await DeleteOrphanedJobsAsync(cancellationToken); - } - catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) - { - return; - } - catch (Exception ex) when (!(ex is OperationCanceledException && cancellationToken.IsCancellationRequested)) - { - logger.LogError(ex, ex.Message); - } - - try - { - await Task.Delay(runInterval, cancellationToken); - } - catch (TaskCanceledException) - { - return; - } - } - } - - private async Task DeleteOrphanedJobsAsync(CancellationToken cancellationToken) - { - var jobsToDelete = await azureProxy.ListOrphanedJobsToDeleteAsync(minJobAge, cancellationToken); - - foreach (var jobId in jobsToDelete) - { - var tesTaskId = jobId.Split(new[] { '-' })[0]; - - TesTask tesTask 
= null; - - if (await repository.TryGetItemAsync(tesTaskId, cancellationToken, item => tesTask = item)) // TODO: Add CancellationToken to IRepository and add unit tests - { - if (tesTask.State == TesState.COMPLETEEnum || - tesTask.State == TesState.EXECUTORERROREnum || - tesTask.State == TesState.SYSTEMERROREnum || - tesTask.State == TesState.CANCELEDEnum || - tesTask.State == TesState.UNKNOWNEnum) - { - await azureProxy.DeleteBatchJobAsync(tesTaskId, cancellationToken); - logger.LogInformation($"Deleted orphaned Batch Job '{jobId}'"); - - await azureProxy.DeleteBatchPoolIfExistsAsync(tesTask.Id, cancellationToken); - } - else - { - logger.LogWarning($"Not deleting orphaned Batch Job '{jobId}' because the corresponding TES task '{tesTaskId}' is in '{tesTask.State}' state."); - } - } - else - { - logger.LogError($"Not deleting orphaned Batch Job '{jobId}' because the corresponding TES task '{tesTaskId}' was not found. Investigate and delete the job manually."); - } - } - } - } -} diff --git a/src/TesApi.Web/IAzureProxy.cs b/src/TesApi.Web/IAzureProxy.cs index 601dfa231..f533e84f5 100644 --- a/src/TesApi.Web/IAzureProxy.cs +++ b/src/TesApi.Web/IAzureProxy.cs @@ -18,23 +18,6 @@ namespace TesApi.Web ///
public interface IAzureProxy { - /// - /// Gets a new Azure Batch job id to schedule another task - /// - /// The unique TES task ID - /// A for controlling the lifetime of the asynchronous operation. - /// The next logical, new Azure Batch job ID - Task GetNextBatchJobIdAsync(string tesTaskId, CancellationToken cancellationToken); - - /// - /// Creates a new Azure Batch job for Autopools - /// - /// - /// - /// - /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. - Task CreateAutoPoolModeBatchJobAsync(string jobId, CloudTask cloudTask, PoolInformation poolInformation, CancellationToken cancellationToken); - /// /// Creates a new Azure Batch job for /// @@ -78,10 +61,9 @@ public interface IAzureProxy /// Gets the combined state of Azure Batch job, task and pool that corresponds to the given TES task /// /// The TES task - /// /// A for controlling the lifetime of the asynchronous operation. /// Job state information - Task GetBatchJobAndTaskStateAsync(TesTask tesTask, bool usingAutoPools, CancellationToken cancellationToken); + Task GetBatchJobAndTaskStateAsync(TesTask tesTask, CancellationToken cancellationToken); /// /// Deletes an Azure Batch job for Autopools diff --git a/src/TesApi.Web/Management/BatchQuotaVerifier.cs b/src/TesApi.Web/Management/BatchQuotaVerifier.cs index 9e14e934e..604c622eb 100644 --- a/src/TesApi.Web/Management/BatchQuotaVerifier.cs +++ b/src/TesApi.Web/Management/BatchQuotaVerifier.cs @@ -61,7 +61,7 @@ public BatchQuotaVerifier(BatchAccountResourceInformation batchAccountInformatio } /// - public async Task CheckBatchAccountQuotasAsync(VirtualMachineInformation virtualMachineInformation, bool needPoolOrJobQuotaCheck, bool needCoresUtilizationQuotaCheck, CancellationToken cancellationToken) + public async Task CheckBatchAccountQuotasAsync(VirtualMachineInformation virtualMachineInformation, bool needPoolOrJobQuotaCheck, CancellationToken cancellationToken) { var workflowCoresRequirement = 
virtualMachineInformation.VCpusAvailable ?? 0; var isDedicated = !virtualMachineInformation.LowPriority; @@ -116,20 +116,6 @@ public async Task CheckBatchAccountQuotasAsync(VirtualMachineInformation virtual throw new AzureBatchQuotaMaxedOutException($"No remaining pool quota available. There are {batchUtilization.ActivePoolsCount} pools in use out of {batchVmFamilyBatchQuotas.PoolQuota}."); } } - - if (needCoresUtilizationQuotaCheck) - { - if ((batchUtilization.TotalCoresInUse + workflowCoresRequirement) > batchVmFamilyBatchQuotas.TotalCoreQuota) - { - throw new AzureBatchQuotaMaxedOutException($"Not enough core quota remaining to schedule task requiring {workflowCoresRequirement} {(isDedicated ? "dedicated" : "low priority")} cores. There are {batchUtilization.TotalCoresInUse} cores in use out of {batchVmFamilyBatchQuotas.TotalCoreQuota}."); - } - - if (isDedicatedAndPerVmFamilyCoreQuotaEnforced && batchUtilization.DedicatedCoresInUseInRequestedVmFamily + workflowCoresRequirement > batchVmFamilyBatchQuotas.VmFamilyQuota) - { - - throw new AzureBatchQuotaMaxedOutException($"Not enough core quota remaining to schedule task requiring {workflowCoresRequirement} dedicated {vmFamily} cores. There are {batchUtilization.DedicatedCoresInUseInRequestedVmFamily} cores in use out of {batchVmFamilyBatchQuotas.VmFamilyQuota}."); - } - } } /// diff --git a/src/TesApi.Web/Management/IBatchQuotaVerifier.cs b/src/TesApi.Web/Management/IBatchQuotaVerifier.cs index c0dcbde95..0c72e0642 100644 --- a/src/TesApi.Web/Management/IBatchQuotaVerifier.cs +++ b/src/TesApi.Web/Management/IBatchQuotaVerifier.cs @@ -17,12 +17,11 @@ public interface IBatchQuotaVerifier /// /// /// Flag to enable checking pool and job quotas. - /// Flag to enable checking core quotas against current utilization. Zero quota checks are always performed. /// A for controlling the lifetime of the asynchronous operation. 
/// /// Thrown when a task requires more cores than total quota available /// Thrown when a max quota condition was identified - Task CheckBatchAccountQuotasAsync(VirtualMachineInformation virtualMachineInformation, bool needPoolOrJobQuotaCheck, bool needCoresUtilizationQuotaCheck, CancellationToken cancellationToken); + Task CheckBatchAccountQuotasAsync(VirtualMachineInformation virtualMachineInformation, bool needPoolOrJobQuotaCheck, CancellationToken cancellationToken); /// /// Gets the instance of the batch quota provider. diff --git a/src/TesApi.Web/Options/BatchSchedulingOptions.cs b/src/TesApi.Web/Options/BatchSchedulingOptions.cs index a72359972..d1f6f8e1f 100644 --- a/src/TesApi.Web/Options/BatchSchedulingOptions.cs +++ b/src/TesApi.Web/Options/BatchSchedulingOptions.cs @@ -19,17 +19,7 @@ public class BatchSchedulingOptions public const double DefaultPoolRotationForcedDays = 30.0; /// - /// Use legacy Azure Batch Autopools implementation - /// - public bool UseLegacyAutopools { get; set; } = false; - - /// - /// Disables background service to delete Batch jobs older than seven days for completed tasks (only relevant if is set) - /// - public bool DisableJobCleanup { get; set; } = false; - - /// - /// Pool Id prefix and metadata value used to associate batch account pools and jobs to this scheduler. Only relevant if is clear + /// Pool Id prefix and metadata value used to associate batch account pools and jobs to this scheduler. /// /// /// This value must be provided. 
@@ -37,7 +27,7 @@ public class BatchSchedulingOptions public string Prefix { get; set; } /// - /// Maximum active lifetime of an azure batch pool (only relevant if is clear) + /// Maximum active lifetime of an azure batch pool /// public double PoolRotationForcedDays { get; set; } = DefaultPoolRotationForcedDays; diff --git a/src/TesApi.Web/Startup.cs b/src/TesApi.Web/Startup.cs index d7260dfa9..b3804e959 100644 --- a/src/TesApi.Web/Startup.cs +++ b/src/TesApi.Web/Startup.cs @@ -136,10 +136,7 @@ public void ConfigureServices(IServiceCollection services) // Order is important for hosted services .AddHostedService(sp => (AllowedVmSizesService)sp.GetRequiredService(typeof(IAllowedVmSizesService))) .AddHostedService() - .AddHostedService() - .AddHostedService() - .AddHostedService() - .AddHostedService(); + .AddHostedService(); //.AddHostedService() } catch (Exception exc) diff --git a/src/deploy-tes-on-azure/scripts/helm/templates/tes-deployment.yaml b/src/deploy-tes-on-azure/scripts/helm/templates/tes-deployment.yaml index 6ab76174c..cb0c10087 100644 --- a/src/deploy-tes-on-azure/scripts/helm/templates/tes-deployment.yaml +++ b/src/deploy-tes-on-azure/scripts/helm/templates/tes-deployment.yaml @@ -64,8 +64,6 @@ spec: value: {{ .Values.config.batchNodes.globalManagedIdentity }} - name: BatchScheduling__UsePreemptibleVmsOnly value: {{ .Values.config.batchScheduling.usePreemptibleVmsOnly | quote}} - - name: BatchScheduling__UseLegacyAutopools - value: {{ .Values.config.batchScheduling.useLegacyAutopools | quote }} - name: BatchScheduling__PoolRotationForcedDays value: {{ .Values.config.batchScheduling.poolRotationForcedDays | quote }} - name: BatchScheduling__Prefix diff --git a/src/deploy-tes-on-azure/scripts/helm/values-template.yaml b/src/deploy-tes-on-azure/scripts/helm/values-template.yaml index 57632eea6..ccf5c4128 100644 --- a/src/deploy-tes-on-azure/scripts/helm/values-template.yaml +++ b/src/deploy-tes-on-azure/scripts/helm/values-template.yaml @@ -18,7 +18,6 
@@ config: globalManagedIdentity: batchScheduling: usePreemptibleVmsOnly: RUNTIME_PARAMETER - useLegacyAutopools: "False" poolRotationForcedDays: "7" prefix: RUNTIME_PARAMETER nodeImages: From 33d2ba8771336b27b503fb143cfba0b86b07360a Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Tue, 26 Sep 2023 19:35:15 -0700 Subject: [PATCH 002/202] Prepare to remove full task batch queries --- src/Tes/Models/TesState.cs | 31 +++++++-- src/Tes/Models/TesTaskExtended.cs | 13 ++-- src/TesApi.Tests/BatchSchedulerTests.cs | 15 +--- src/TesApi.Web/AzureProxy.cs | 69 +++++++++---------- src/TesApi.Web/BatchScheduler.cs | 42 +++++------ .../CachingWithRetriesAzureProxy.cs | 7 +- .../BasicTesTaskContractResolver.cs | 1 - .../FullTesTaskContractResolver.cs | 1 - src/TesApi.Web/Controllers/TaskServiceApi.cs | 11 +-- src/TesApi.Web/IAzureProxy.cs | 13 ++-- .../Management/Batch/ArmBatchPoolManager.cs | 15 +++- src/TesApi.Web/Scheduler.cs | 5 +- 12 files changed, 119 insertions(+), 104 deletions(-) diff --git a/src/Tes/Models/TesState.cs b/src/Tes/Models/TesState.cs index 1567c8560..6ac35977d 100644 --- a/src/Tes/Models/TesState.cs +++ b/src/Tes/Models/TesState.cs @@ -17,13 +17,24 @@ namespace Tes.Models { /// - /// Task states. - UNKNOWN: The state of the task is unknown. This provides a safe default for messages where this field is missing, for example, so that a missing field does not accidentally imply that the state is QUEUED. - QUEUED: The task is queued. - INITIALIZING: The task has been assigned to a worker and is currently preparing to run. For example, the worker may be turning on, downloading input files, etc. - RUNNING: The task is running. Input files are downloaded and the first Executor has been started. - PAUSED: The task is paused. An implementation may have the ability to pause a task, but this is not required. - COMPLETE: The task has completed running. Executors have exited without error and output files have been successfully uploaded. 
- EXECUTOR_ERROR: The task encountered an error in one of the Executor processes. Generally, this means that an Executor exited with a non-zero exit code. - SYSTEM_ERROR: The task was stopped due to a system error, but not from an Executor, for example an upload failed due to network issues, the worker's ran out of disk space, etc. - CANCELED: The task was canceled by the user. + /// Task states. /// - /// Task states. - UNKNOWN: The state of the task is unknown. This provides a safe default for messages where this field is missing, for example, so that a missing field does not accidentally imply that the state is QUEUED. - QUEUED: The task is queued. - INITIALIZING: The task has been assigned to a worker and is currently preparing to run. For example, the worker may be turning on, downloading input files, etc. - RUNNING: The task is running. Input files are downloaded and the first Executor has been started. - PAUSED: The task is paused. An implementation may have the ability to pause a task, but this is not required. - COMPLETE: The task has completed running. Executors have exited without error and output files have been successfully uploaded. - EXECUTOR_ERROR: The task encountered an error in one of the Executor processes. Generally, this means that an Executor exited with a non-zero exit code. - SYSTEM_ERROR: The task was stopped due to a system error, but not from an Executor, for example an upload failed due to network issues, the worker's ran out of disk space, etc. - CANCELED: The task was canceled by the user. + /// + /// - UNKNOWN: The state of the task is unknown. This provides a safe default for messages where this field is missing, for example, so that a missing field does not accidentally imply that the state is QUEUED. + /// - QUEUED: The task is queued. + /// - INITIALIZING: The task has been assigned to a worker and is currently preparing to run. For example, the worker may be turning on, downloading input files, etc. 
+ /// - RUNNING: The task is running. Input files are downloaded and the first Executor has been started. + /// - PAUSED: The task is paused. An implementation may have the ability to pause a task, but this is not required. + /// - COMPLETE: The task has completed running. Executors have exited without error and output files have been successfully uploaded. + /// - EXECUTOR_ERROR: The task encountered an error in one of the Executor processes. Generally, this means that an Executor exited with a non-zero exit code. + /// - SYSTEM_ERROR: The task was stopped due to a system error, but not from an Executor, for example an upload failed due to network issues, the worker's ran out of disk space, etc. + /// - CANCELED: The task was canceled by the user. + /// - PREEMPTED: The task is stopped (preempted) by the system. The reasons for this would be tied to the specific system running the job. Generally, this means that the system reclaimed the compute capacity for reallocation. + /// - CANCELING: The task was canceled by the user, but the downstream resources are still awaiting deletion. 
+ /// [JsonConverter(typeof(Newtonsoft.Json.Converters.StringEnumConverter))] public enum TesState { - /// /// Enum UNKNOWNEnum for UNKNOWN /// @@ -76,6 +87,18 @@ public enum TesState /// Enum CANCELEDEnum for CANCELED /// [EnumMember(Value = "CANCELED")] - CANCELEDEnum = 9 + CANCELEDEnum = 9, + + /// + /// Enum PREEMPTEDEnum for PREEMPTED + /// + [EnumMember(Value = "PREEMPTED")] + PREEMPTEDEnum = 10, + + /// + /// Enum CANCELINGEnum for CANCELING + /// + [EnumMember(Value = "CANCELING")] + CANCELINGEnum = 11 } } diff --git a/src/Tes/Models/TesTaskExtended.cs b/src/Tes/Models/TesTaskExtended.cs index e1635cd1e..003670b21 100644 --- a/src/Tes/Models/TesTaskExtended.cs +++ b/src/Tes/Models/TesTaskExtended.cs @@ -15,11 +15,14 @@ public partial class TesTask : RepositoryItem private static readonly Regex CromwellTaskInstanceNameRegex = new("(.*):[^:]*:[^:]*"); private static readonly Regex CromwellShardRegex = new(".*:([^:]*):[^:]*"); private static readonly Regex CromwellAttemptRegex = new(".*:([^:]*)"); - public static readonly List ActiveStates = new List { + + public static readonly List ActiveStates = new() { TesState.QUEUEDEnum, TesState.RUNNINGEnum, TesState.PAUSEDEnum, - TesState.INITIALIZINGEnum}; + TesState.INITIALIZINGEnum, + TesState.CANCELINGEnum + }; /// /// Number of retries attempted @@ -27,12 +30,6 @@ public partial class TesTask : RepositoryItem [DataMember(Name = "error_count")] public int ErrorCount { get; set; } - /// - /// Boolean of whether cancellation was requested - /// - [DataMember(Name = "is_cancel_requested")] - public bool IsCancelRequested { get; set; } - /// /// Date + time the task was completed, in RFC 3339 format. This is set by the system, not the client. 
/// diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index 29b32cbf7..6152ffa7a 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -868,7 +868,7 @@ public async Task TaskThatFailsWithNodeAllocationErrorIsRequeuedOnDifferentVmSiz [TestMethod] public async Task TaskGetsCancelled() { - var tesTask = new TesTask { Id = "test", PoolId = "pool1", State = TesState.CANCELEDEnum, IsCancelRequested = true }; + var tesTask = new TesTask { Id = "test", PoolId = "pool1", State = TesState.CANCELINGEnum }; var azureProxyReturnValues = AzureProxyReturnValues.Defaults; azureProxyReturnValues.BatchJobAndTaskState = BatchJobAndTaskStates.TaskActive; @@ -884,8 +884,7 @@ public async Task TaskGetsCancelled() GuardAssertsWithTesTask(tesTask, () => { Assert.AreEqual(TesState.CANCELEDEnum, tesTask.State); - Assert.IsFalse(tesTask.IsCancelRequested); - azureProxy.Verify(i => i.DeleteBatchTaskAsync(tesTask.Id, It.IsAny(), It.IsAny())); + azureProxy.Verify(i => i.TerminateBatchTaskAsync(tesTask.Id, It.IsAny(), It.IsAny())); }); } @@ -1664,10 +1663,6 @@ private static Action> GetMockAzureProxy(AzureProxyReturnValue azureProxy.Setup(a => a.CreateBatchPoolAsync(It.IsAny(), It.IsAny(), It.IsAny())) .Returns((Pool p, bool _1, System.Threading.CancellationToken _2) => Task.FromResult(azureProxyReturnValues.CreateBatchPoolImpl(p))); - azureProxy.Setup(a => a.DeleteBatchPoolIfExistsAsync(It.IsAny(), It.IsAny())) - .Callback((poolId, cancellationToken) => azureProxyReturnValues.AzureProxyDeleteBatchPoolIfExistsImpl(poolId, cancellationToken)) - .Returns(Task.CompletedTask); - azureProxy.Setup(a => a.GetFullAllocationStateAsync(It.IsAny(), It.IsAny())) .Returns(Task.FromResult(azureProxyReturnValues.AzureProxyGetFullAllocationState?.Invoke() ?? 
(null, null, null, null, null, null))); @@ -1871,12 +1866,6 @@ private static AzureProxyReturnValues DefaultsPerVMFamilyEnforcedImpl() private readonly Dictionary> poolMetadata = new(); - internal void AzureProxyDeleteBatchPoolIfExistsImpl(string poolId, System.Threading.CancellationToken cancellationToken) - { - _ = poolMetadata.Remove(poolId); - AzureProxyDeleteBatchPoolIfExists(poolId, cancellationToken); - } - internal void AzureProxyDeleteBatchPoolImpl(string poolId, System.Threading.CancellationToken cancellationToken) { _ = poolMetadata.Remove(poolId); diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index 96ada7d27..1614a6aef 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -396,6 +396,39 @@ public async Task DeleteBatchJobAsync(string tesTaskId, CancellationToken cancel } } + /// + public async Task TerminateBatchTaskAsync(string tesTaskId, PoolInformation pool, CancellationToken cancellationToken) + { + var jobFilter = new ODATADetailLevel + { + FilterClause = $"startswith(id,'{tesTaskId}{BatchJobAttemptSeparator}')", + SelectClause = "id" + }; + + List batchTasksToDelete = default; + + try + { + batchTasksToDelete = await batchClient.JobOperations.ListTasks(pool.PoolId, jobFilter).ToAsyncEnumerable().ToListAsync(cancellationToken); + } + catch (BatchException ex) when (ex.InnerException is Microsoft.Azure.Batch.Protocol.Models.BatchErrorException bee && "JobNotFound".Equals(bee.Body?.Code, StringComparison.InvariantCultureIgnoreCase)) + { + logger.LogWarning("Job not found for TES task {TesTask}", tesTaskId); + return; // Task cannot exist if the job is not found. 
+ } + + if (batchTasksToDelete.Count > 1) + { + logger.LogWarning("Found more than one active task for TES task {TesTask}", tesTaskId); + } + + foreach (var task in batchTasksToDelete) + { + logger.LogInformation("Terminating task {BatchTask}", task.Id); + await batchNodeNotReadyRetryPolicy.ExecuteAsync(ct => task.TerminateAsync(cancellationToken: ct), cancellationToken); + } + } + /// public async Task DeleteBatchTaskAsync(string tesTaskId, PoolInformation pool, CancellationToken cancellationToken) { @@ -497,42 +530,6 @@ public Task DeleteBatchComputeNodesAsync(string poolId, IEnumerable public Task DeleteBatchPoolAsync(string poolId, CancellationToken cancellationToken = default) => batchPoolManager.DeleteBatchPoolAsync(poolId, cancellationToken: cancellationToken); - /// - public async Task DeleteBatchPoolIfExistsAsync(string poolId, CancellationToken cancellationToken = default) - { - try - { - var poolFilter = new ODATADetailLevel - { - FilterClause = $"startswith(id,'{poolId}') and state ne 'deleting'", - SelectClause = "id" - }; - - var poolsToDelete = await batchClient.PoolOperations.ListPools(poolFilter).ToListAsync(cancellationToken); - - foreach (var pool in poolsToDelete) - { - logger.LogInformation($"Pool ID: {pool.Id} Pool State: {pool?.State} deleting..."); - await batchClient.PoolOperations.DeletePoolAsync(pool.Id, cancellationToken: cancellationToken); - } - } - catch (Exception exc) - { - var batchErrorCode = (exc as BatchException)?.RequestInformation?.BatchError?.Code; - - if (batchErrorCode?.Trim().Equals("PoolBeingDeleted", StringComparison.OrdinalIgnoreCase) == true) - { - // Do not throw if it's a deletion race condition - // Docs: https://learn.microsoft.com/en-us/rest/api/batchservice/Pool/Delete?tabs=HTTP - - return; - } - - logger.LogError(exc, $"Pool ID: {poolId} exception while attempting to delete the pool. 
Batch error code: {batchErrorCode}"); - throw; - } - } - /// public Task GetBatchPoolAsync(string poolId, CancellationToken cancellationToken = default, DetailLevel detailLevel = default) => batchClient.PoolOperations.GetPoolAsync(poolId, detailLevel: detailLevel, cancellationToken: cancellationToken); diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 09d45c344..59e34c91b 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -185,7 +185,8 @@ public BatchScheduler( static bool tesTaskIsInitializingOrRunning(TesTask tesTask) => tesTask.State == TesState.INITIALIZINGEnum || tesTask.State == TesState.RUNNINGEnum; static bool tesTaskIsQueuedOrInitializing(TesTask tesTask) => tesTask.State == TesState.QUEUEDEnum || tesTask.State == TesState.INITIALIZINGEnum; static bool tesTaskIsQueued(TesTask tesTask) => tesTask.State == TesState.QUEUEDEnum; - static bool tesTaskCancellationRequested(TesTask tesTask) => tesTask.State == TesState.CANCELEDEnum && tesTask.IsCancelRequested; + static bool tesTaskCancellationRequested(TesTask tesTask) => tesTask.State == TesState.CANCELINGEnum; + //static bool tesTaskDeletionReady(TesTask tesTask) => tesTask.IsTaskDeletionRequired; static void SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, CombinedBatchTaskInfo batchInfo) { @@ -219,26 +220,25 @@ static void SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, CombinedB async Task SetTaskCompleted(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { - await DeleteBatchTaskAndOrJobAndOrPoolIfExists(azureProxy, tesTask, batchInfo, cancellationToken); + await TerminateBatchTaskAsync(tesTask, batchInfo, cancellationToken); SetTaskStateAndLog(tesTask, TesState.COMPLETEEnum, batchInfo); } async Task SetTaskExecutorError(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { - await DeleteBatchTaskAndOrJobAndOrPoolIfExists(azureProxy, tesTask, 
batchInfo, cancellationToken); + await TerminateBatchTaskAsync(tesTask, batchInfo, cancellationToken); SetTaskStateAndLog(tesTask, TesState.EXECUTORERROREnum, batchInfo); } async Task SetTaskSystemError(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { - await DeleteBatchTaskAndOrJobAndOrPoolIfExists(azureProxy, tesTask, batchInfo, cancellationToken); + await TerminateBatchTaskAsync(tesTask, batchInfo, cancellationToken); SetTaskStateAndLog(tesTask, TesState.SYSTEMERROREnum, batchInfo); } async Task DeleteBatchJobAndSetTaskStateAsync(TesTask tesTask, TesState newTaskState, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { - await azureProxy.DeleteBatchTaskAsync(tesTask.Id, batchInfo.Pool, cancellationToken); - await azureProxy.DeleteBatchPoolIfExistsAsync(tesTask.Id, cancellationToken); + await TerminateBatchTaskAsync(tesTask, batchInfo, cancellationToken); SetTaskStateAndLog(tesTask, newTaskState, batchInfo); } @@ -256,11 +256,10 @@ Task AddSystemLogAndDeleteBatchJobAndSetTaskExecutorErrorAsync(TesTask tesTask, return DeleteBatchJobAndSetTaskExecutorErrorAsync(tesTask, batchInfo, cancellationToken); } - async Task CancelTaskAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) + Task CancelTaskAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { - await azureProxy.DeleteBatchTaskAsync(tesTask.Id, batchInfo.Pool, cancellationToken); - await azureProxy.DeleteBatchPoolIfExistsAsync(tesTask.Id, cancellationToken); - tesTask.IsCancelRequested = false; + tesTask.State = TesState.CANCELEDEnum; + return TerminateBatchTaskAsync(tesTask, batchInfo, cancellationToken); } Task HandlePreemptedNodeAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) @@ -272,6 +271,7 @@ Task HandlePreemptedNodeAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, tesTaskStateTransitions = new List() { + //new 
TesTaskStateTransition(tesTaskDeletionReady, batchTaskState: null, alternateSystemLogItem: null, DeleteCancelledTaskAsync), new TesTaskStateTransition(tesTaskCancellationRequested, batchTaskState: null, alternateSystemLogItem: null, CancelTaskAsync), new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.JobNotFound, alternateSystemLogItem: null, (tesTask, _, ct) => AddBatchTaskAsync(tesTask, ct)), new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.MissingBatchTask, alternateSystemLogItem: null, (tesTask, batchInfo, ct) => AddBatchTaskAsync(tesTask, ct)), @@ -290,6 +290,12 @@ Task HandlePreemptedNodeAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, }; } + //private async Task DeleteCancelledTaskAsync(TesTask task, CombinedBatchTaskInfo info, CancellationToken token) + //{ + // // TODO: check if task is old enough to delete. + // await azureProxy.DeleteBatchTaskAsync(task.Id, info.Pool, token); + //} + private Task WarnWhenUnableToFindPoolToDeleteTask(TesTask tesTask) { logger.LogWarning("Unable to delete batch task for task {TesTask} because of missing pool/job information.", tesTask.Id); @@ -297,23 +303,17 @@ private Task WarnWhenUnableToFindPoolToDeleteTask(TesTask tesTask) return Task.CompletedTask; } - private async Task DeleteBatchTaskAndOrJobAndOrPoolIfExists(IAzureProxy azureProxy, TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) + private async Task TerminateBatchTaskAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { - var batchDeletionExceptions = new List(); - try { - await azureProxy.DeleteBatchTaskAsync(tesTask.Id, batchInfo.Pool, cancellationToken); + await azureProxy.TerminateBatchTaskAsync(tesTask.Id, batchInfo.Pool, cancellationToken); + //tesTask.IsTaskDeletionRequired = true; } catch (Exception exc) { - logger.LogError(exc, $"Exception deleting batch task or job with tesTask.Id: {tesTask?.Id}"); - batchDeletionExceptions.Add(exc); - } - - if 
(batchDeletionExceptions.Any()) - { - throw new AggregateException(batchDeletionExceptions); + logger.LogError(exc, "Exception deleting batch task with tesTask.Id: {TesTaskId}", tesTask?.Id); + throw; } } diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index d228de624..67babb914 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -93,6 +93,9 @@ public async Task AddBatchTaskAsync(string tesTaskId, CloudTask cloudTask, PoolI /// public Task DeleteBatchJobAsync(string taskId, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchJobAsync(taskId, ct), cancellationToken); + /// + public Task TerminateBatchTaskAsync(string taskId, PoolInformation poolInformation, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.TerminateBatchTaskAsync(taskId, poolInformation, ct), cancellationToken); + /// public Task DeleteBatchTaskAsync(string taskId, PoolInformation poolInformation, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchTaskAsync(taskId, poolInformation, ct), cancellationToken); @@ -203,10 +206,6 @@ public async Task CreateBatchPoolAsync(BatchModels.Pool poolInf } } - /// - public Task DeleteBatchPoolIfExistsAsync(string poolId, CancellationToken cancellationToken) - => azureProxy.DeleteBatchPoolIfExistsAsync(poolId, cancellationToken); - /// public Task<(AllocationState? AllocationState, bool? AutoScaleEnabled, int? TargetLowPriority, int? CurrentLowPriority, int? TargetDedicated, int? 
CurrentDedicated)> GetFullAllocationStateAsync(string poolId, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAndCachingAsync( diff --git a/src/TesApi.Web/ContractResolvers/BasicTesTaskContractResolver.cs b/src/TesApi.Web/ContractResolvers/BasicTesTaskContractResolver.cs index 311ff238f..3b2a7ce4b 100644 --- a/src/TesApi.Web/ContractResolvers/BasicTesTaskContractResolver.cs +++ b/src/TesApi.Web/ContractResolvers/BasicTesTaskContractResolver.cs @@ -28,7 +28,6 @@ public class BasicTesTaskContractResolver : DefaultContractResolver Tuple.Create(typeof(TesExecutorLog), nameof(TesExecutorLog.Stderr)), Tuple.Create(typeof(TesInput), nameof(TesInput.Content)), Tuple.Create(typeof(TesTaskLog), nameof(TesTaskLog.SystemLogs)), - Tuple.Create(typeof(TesTask), nameof(TesTask.IsCancelRequested)), Tuple.Create(typeof(TesTask), nameof(TesTask.ErrorCount)), Tuple.Create(typeof(TesTask), nameof(TesTask.EndTime)), Tuple.Create(typeof(TesTask), nameof(TesTask.PoolId)), diff --git a/src/TesApi.Web/ContractResolvers/FullTesTaskContractResolver.cs b/src/TesApi.Web/ContractResolvers/FullTesTaskContractResolver.cs index f6101b714..4029212f0 100644 --- a/src/TesApi.Web/ContractResolvers/FullTesTaskContractResolver.cs +++ b/src/TesApi.Web/ContractResolvers/FullTesTaskContractResolver.cs @@ -19,7 +19,6 @@ public class FullTesTaskContractResolver : DefaultContractResolver // In FULL view, task message will include all fields EXCEPT custom fields added to support running TES with Cromwell on Azure private static readonly List> PropertiesToSkip = new() { - Tuple.Create(typeof(TesTask), nameof(TesTask.IsCancelRequested)), Tuple.Create(typeof(TesTask), nameof(TesTask.ErrorCount)), Tuple.Create(typeof(TesTask), nameof(TesTask.EndTime)), Tuple.Create(typeof(TesTask), nameof(TesTask.PoolId)), diff --git a/src/TesApi.Web/Controllers/TaskServiceApi.cs b/src/TesApi.Web/Controllers/TaskServiceApi.cs index cf4f8f3ee..20342d09e 100644 --- 
a/src/TesApi.Web/Controllers/TaskServiceApi.cs +++ b/src/TesApi.Web/Controllers/TaskServiceApi.cs @@ -84,15 +84,16 @@ public virtual async Task CancelTask([FromRoute][Required] string { if (tesTask.State == TesState.COMPLETEEnum || tesTask.State == TesState.EXECUTORERROREnum || - tesTask.State == TesState.SYSTEMERROREnum) + tesTask.State == TesState.SYSTEMERROREnum || + tesTask.State == TesState.PREEMPTEDEnum || + tesTask.State == TesState.CANCELINGEnum) { - logger.LogInformation($"Task {id} cannot be canceled because it is in {tesTask.State} state."); + logger.LogInformation("Task {TesTaskId} cannot be canceled because it is in {TesTaskState} state.", id, tesTask.State); } else if (tesTask.State != TesState.CANCELEDEnum) { logger.LogInformation("Canceling task"); - tesTask.IsCancelRequested = true; - tesTask.State = TesState.CANCELEDEnum; + tesTask.State = TesState.CANCELINGEnum; try { @@ -100,7 +101,7 @@ public virtual async Task CancelTask([FromRoute][Required] string } catch (RepositoryCollisionException exc) { - logger.LogError(exc, $"RepositoryCollisionException in CancelTask for {id}"); + logger.LogError(exc, "RepositoryCollisionException in CancelTask for {TesTaskId}", id); return Conflict(new { message = "The task could not be updated due to a conflict with the current state; please retry." }); } } diff --git a/src/TesApi.Web/IAzureProxy.cs b/src/TesApi.Web/IAzureProxy.cs index f533e84f5..18bb587a1 100644 --- a/src/TesApi.Web/IAzureProxy.cs +++ b/src/TesApi.Web/IAzureProxy.cs @@ -72,6 +72,14 @@ public interface IAzureProxy /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. Task DeleteBatchJobAsync(string taskId, CancellationToken cancellationToken); + /// + /// Terminates an Azure Batch task + /// + /// The unique TES task ID + /// + /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. 
+ Task TerminateBatchTaskAsync(string taskId, PoolInformation poolInformation, CancellationToken cancellationToken); + /// /// Deletes an Azure Batch task /// @@ -203,11 +211,6 @@ public interface IAzureProxy /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. Task DeleteBatchPoolAsync(string poolId, CancellationToken cancellationToken); - /// - /// Deletes the specified pool if it exists - /// - Task DeleteBatchPoolIfExistsAsync(string poolId, CancellationToken cancellationToken); - /// /// Retrieves the specified pool /// diff --git a/src/TesApi.Web/Management/Batch/ArmBatchPoolManager.cs b/src/TesApi.Web/Management/Batch/ArmBatchPoolManager.cs index 2333c9758..9418154c2 100644 --- a/src/TesApi.Web/Management/Batch/ArmBatchPoolManager.cs +++ b/src/TesApi.Web/Management/Batch/ArmBatchPoolManager.cs @@ -5,6 +5,7 @@ using System.Threading; using System.Threading.Tasks; using Microsoft.Azure.Batch; +using Microsoft.Azure.Batch.Common; using Microsoft.Azure.Management.Batch; using Microsoft.Azure.Management.Batch.Models; using Microsoft.Extensions.Logging; @@ -76,9 +77,19 @@ await batchManagementClient.Pool.DeleteWithHttpMessagesAsync( $"Successfully deleted pool with the id/name:{poolId} in Batch account:{azureClientsFactory.BatchAccountInformation.Name}"); } - catch (Exception e) + catch (Exception exc) { - logger.LogError(e, $"Error trying to delete pool named {poolId}"); + var batchErrorCode = (exc as BatchException)?.RequestInformation?.BatchError?.Code; + + if (batchErrorCode?.Trim().Equals("PoolBeingDeleted", StringComparison.OrdinalIgnoreCase) == true) + { + // Do not throw if it's a deletion race condition + // Docs: https://learn.microsoft.com/en-us/rest/api/batchservice/Pool/Delete?tabs=HTTP + + return; + } + + logger.LogError(exc, "Error trying to delete pool named {PoolId}", poolId); throw; } diff --git a/src/TesApi.Web/Scheduler.cs b/src/TesApi.Web/Scheduler.cs index 63604951e..819295c38 100644 --- 
a/src/TesApi.Web/Scheduler.cs +++ b/src/TesApi.Web/Scheduler.cs @@ -103,10 +103,7 @@ private async ValueTask OrchestrateTesTasksOnBatch(CancellationToken stoppingTok var pools = new HashSet(); var tesTasks = (await repository.GetItemsAsync( - predicate: t => t.State == TesState.QUEUEDEnum - || t.State == TesState.INITIALIZINGEnum - || t.State == TesState.RUNNINGEnum - || (t.State == TesState.CANCELEDEnum && t.IsCancelRequested), + predicate: t => t.State == TesState.QUEUEDEnum || t.State == TesState.CANCELINGEnum, cancellationToken: stoppingToken)) .OrderBy(t => t.CreationTime) .ToList(); From 6c2febc238d9b2c21ab0a3f872af378ba911e96b Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 27 Sep 2023 09:32:17 -0700 Subject: [PATCH 003/202] Fix issues in previous commit --- src/Tes/Models/TesTaskExtended.cs | 12 ++++++- src/TesApi.Tests/BatchSchedulerTests.cs | 28 ++++++++++++++-- src/TesApi.Web/BatchScheduler.cs | 32 ++++++++----------- .../BasicTesTaskContractResolver.cs | 23 ++++++------- .../FullTesTaskContractResolver.cs | 15 +++++---- src/TesApi.Web/Scheduler.cs | 2 +- 6 files changed, 70 insertions(+), 42 deletions(-) diff --git a/src/Tes/Models/TesTaskExtended.cs b/src/Tes/Models/TesTaskExtended.cs index 003670b21..357f4e495 100644 --- a/src/Tes/Models/TesTaskExtended.cs +++ b/src/Tes/Models/TesTaskExtended.cs @@ -30,6 +30,12 @@ public partial class TesTask : RepositoryItem [DataMember(Name = "error_count")] public int ErrorCount { get; set; } + /// + /// Boolean of whether deletion after termination is required. + /// + [DataMember(Name = "is_delete_required")] + public bool IsTaskDeletionRequired { get; set; } + /// /// Date + time the task was completed, in RFC 3339 format. This is set by the system, not the client. /// @@ -85,9 +91,13 @@ public partial class TesTask : RepositoryItem [IgnoreDataMember] public int? CromwellAttempt => this.Description == null ? 
null : (int.TryParse(CromwellAttemptRegex.Match(this.Description).Groups[1].Value, out var result) ? result : null); + /// + /// True if task should be kept in the cache. + /// + /// public bool IsActiveState() { - return ActiveStates.Contains(this.State); + return ActiveStates.Contains(this.State) || IsTaskDeletionRequired; } } } diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index 6152ffa7a..b593fdb15 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -884,10 +884,34 @@ public async Task TaskGetsCancelled() GuardAssertsWithTesTask(tesTask, () => { Assert.AreEqual(TesState.CANCELEDEnum, tesTask.State); + Assert.IsTrue(tesTask.IsTaskDeletionRequired); azureProxy.Verify(i => i.TerminateBatchTaskAsync(tesTask.Id, It.IsAny(), It.IsAny())); }); } + [TestMethod] + public async Task CancelledTaskGetsDeleted() + { + var tesTask = new TesTask { Id = "test", PoolId = "pool1", State = TesState.CANCELEDEnum, IsTaskDeletionRequired = true }; + + var azureProxyReturnValues = AzureProxyReturnValues.Defaults; + azureProxyReturnValues.BatchJobAndTaskState = BatchJobAndTaskStates.TaskActive; + Mock azureProxy = default; + var azureProxySetter = new Action>(mock => + { + GetMockAzureProxy(azureProxyReturnValues)(mock); + azureProxy = mock; + }); + + _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxySetter, azureProxyReturnValues); + + GuardAssertsWithTesTask(tesTask, () => + { + Assert.IsFalse(tesTask.IsTaskDeletionRequired); + azureProxy.Verify(i => i.DeleteBatchTaskAsync(tesTask.Id, It.IsAny(), It.IsAny())); + }); + } + [TestMethod] public async Task SuccessfullyCompletedTaskContainsBatchNodeMetrics() { @@ -1580,11 +1604,9 @@ private static Action> GetMockQuotaProvider(AzureProxy private static TestServices.TestServiceProvider GetServiceProvider(IEnumerable<(string Key, string Value)> configuration, Action> azureProxy, Action> quotaProvider, 
Action> skuInfoProvider, Action> containerRegistryProviderSetup, Action> allowedVmSizesServiceSetup, Action additionalActions = default) => new(wrapAzureProxy: true, configuration: configuration, azureProxy: azureProxy, batchQuotaProvider: quotaProvider, batchSkuInformationProvider: skuInfoProvider, accountResourceInformation: GetNewBatchResourceInfo(), containerRegistryProviderSetup: containerRegistryProviderSetup, allowedVmSizesServiceSetup: allowedVmSizesServiceSetup, additionalActions: additionalActions); - // TODO: Are these tests still valid? private static async Task GetNewTesTaskStateAsync(TesTask tesTask, AzureProxyReturnValues azureProxyReturnValues) { - //_ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(azureProxyReturnValues), azureProxyReturnValues); - var test = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(azureProxyReturnValues), azureProxyReturnValues); + _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(azureProxyReturnValues), azureProxyReturnValues); return tesTask.State; } diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 59e34c91b..47a3de114 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -186,7 +186,7 @@ public BatchScheduler( static bool tesTaskIsQueuedOrInitializing(TesTask tesTask) => tesTask.State == TesState.QUEUEDEnum || tesTask.State == TesState.INITIALIZINGEnum; static bool tesTaskIsQueued(TesTask tesTask) => tesTask.State == TesState.QUEUEDEnum; static bool tesTaskCancellationRequested(TesTask tesTask) => tesTask.State == TesState.CANCELINGEnum; - //static bool tesTaskDeletionReady(TesTask tesTask) => tesTask.IsTaskDeletionRequired; + static bool tesTaskDeletionReady(TesTask tesTask) => tesTask.IsTaskDeletionRequired; static void SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, CombinedBatchTaskInfo 
batchInfo) { @@ -220,25 +220,25 @@ static void SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, CombinedB async Task SetTaskCompleted(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { - await TerminateBatchTaskAsync(tesTask, batchInfo, cancellationToken); + await azureProxy.DeleteBatchTaskAsync(tesTask.Id, batchInfo.Pool, cancellationToken); SetTaskStateAndLog(tesTask, TesState.COMPLETEEnum, batchInfo); } async Task SetTaskExecutorError(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { - await TerminateBatchTaskAsync(tesTask, batchInfo, cancellationToken); + await azureProxy.DeleteBatchTaskAsync(tesTask.Id, batchInfo.Pool, cancellationToken); SetTaskStateAndLog(tesTask, TesState.EXECUTORERROREnum, batchInfo); } async Task SetTaskSystemError(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { - await TerminateBatchTaskAsync(tesTask, batchInfo, cancellationToken); + await azureProxy.DeleteBatchTaskAsync(tesTask.Id, batchInfo.Pool, cancellationToken); SetTaskStateAndLog(tesTask, TesState.SYSTEMERROREnum, batchInfo); } async Task DeleteBatchJobAndSetTaskStateAsync(TesTask tesTask, TesState newTaskState, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { - await TerminateBatchTaskAsync(tesTask, batchInfo, cancellationToken); + await azureProxy.DeleteBatchTaskAsync(tesTask.Id, batchInfo.Pool, cancellationToken); SetTaskStateAndLog(tesTask, newTaskState, batchInfo); } @@ -258,7 +258,6 @@ Task AddSystemLogAndDeleteBatchJobAndSetTaskExecutorErrorAsync(TesTask tesTask, Task CancelTaskAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { - tesTask.State = TesState.CANCELEDEnum; return TerminateBatchTaskAsync(tesTask, batchInfo, cancellationToken); } @@ -271,7 +270,7 @@ Task HandlePreemptedNodeAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, tesTaskStateTransitions = new List() { - 
//new TesTaskStateTransition(tesTaskDeletionReady, batchTaskState: null, alternateSystemLogItem: null, DeleteCancelledTaskAsync), + new TesTaskStateTransition(tesTaskDeletionReady, batchTaskState: null, alternateSystemLogItem: null, DeleteCancelledTaskAsync), new TesTaskStateTransition(tesTaskCancellationRequested, batchTaskState: null, alternateSystemLogItem: null, CancelTaskAsync), new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.JobNotFound, alternateSystemLogItem: null, (tesTask, _, ct) => AddBatchTaskAsync(tesTask, ct)), new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.MissingBatchTask, alternateSystemLogItem: null, (tesTask, batchInfo, ct) => AddBatchTaskAsync(tesTask, ct)), @@ -290,17 +289,11 @@ Task HandlePreemptedNodeAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, }; } - //private async Task DeleteCancelledTaskAsync(TesTask task, CombinedBatchTaskInfo info, CancellationToken token) - //{ - // // TODO: check if task is old enough to delete. - // await azureProxy.DeleteBatchTaskAsync(task.Id, info.Pool, token); - //} - - private Task WarnWhenUnableToFindPoolToDeleteTask(TesTask tesTask) + private async Task DeleteCancelledTaskAsync(TesTask task, CombinedBatchTaskInfo info, CancellationToken token) { - logger.LogWarning("Unable to delete batch task for task {TesTask} because of missing pool/job information.", tesTask.Id); - tesTask.SetWarning("Unable to delete batch task because of missing pool/job information."); - return Task.CompletedTask; + // TODO: check if task is old enough to delete. 
+ await azureProxy.DeleteBatchTaskAsync(task.Id, info.Pool, token); + task.IsTaskDeletionRequired = false; } private async Task TerminateBatchTaskAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) @@ -308,11 +301,12 @@ private async Task TerminateBatchTaskAsync(TesTask tesTask, CombinedBatchTaskInf try { await azureProxy.TerminateBatchTaskAsync(tesTask.Id, batchInfo.Pool, cancellationToken); - //tesTask.IsTaskDeletionRequired = true; + tesTask.IsTaskDeletionRequired = true; + tesTask.State = TesState.CANCELEDEnum; } catch (Exception exc) { - logger.LogError(exc, "Exception deleting batch task with tesTask.Id: {TesTaskId}", tesTask?.Id); + logger.LogError(exc, "Exception terminating batch task with tesTask.Id: {TesTaskId}", tesTask?.Id); throw; } } diff --git a/src/TesApi.Web/ContractResolvers/BasicTesTaskContractResolver.cs b/src/TesApi.Web/ContractResolvers/BasicTesTaskContractResolver.cs index 3b2a7ce4b..feae42730 100644 --- a/src/TesApi.Web/ContractResolvers/BasicTesTaskContractResolver.cs +++ b/src/TesApi.Web/ContractResolvers/BasicTesTaskContractResolver.cs @@ -24,17 +24,18 @@ public class BasicTesTaskContractResolver : DefaultContractResolver // plus additional custom fields added to support running TES with Cromwell on Azure private static readonly List> PropertiesToSkip = new() { - Tuple.Create(typeof(TesExecutorLog), nameof(TesExecutorLog.Stdout)), - Tuple.Create(typeof(TesExecutorLog), nameof(TesExecutorLog.Stderr)), - Tuple.Create(typeof(TesInput), nameof(TesInput.Content)), - Tuple.Create(typeof(TesTaskLog), nameof(TesTaskLog.SystemLogs)), - Tuple.Create(typeof(TesTask), nameof(TesTask.ErrorCount)), - Tuple.Create(typeof(TesTask), nameof(TesTask.EndTime)), - Tuple.Create(typeof(TesTask), nameof(TesTask.PoolId)), - Tuple.Create(typeof(TesTask), nameof(TesTask.WorkflowId)), - Tuple.Create(typeof(RepositoryItem), nameof(RepositoryItem.ETag)), - Tuple.Create(typeof(RepositoryItem), 
nameof(RepositoryItem.PartitionKey)) - }; + Tuple.Create(typeof(TesExecutorLog), nameof(TesExecutorLog.Stdout)), + Tuple.Create(typeof(TesExecutorLog), nameof(TesExecutorLog.Stderr)), + Tuple.Create(typeof(TesInput), nameof(TesInput.Content)), + Tuple.Create(typeof(TesTaskLog), nameof(TesTaskLog.SystemLogs)), + Tuple.Create(typeof(TesTask), nameof(TesTask.IsTaskDeletionRequired)), + Tuple.Create(typeof(TesTask), nameof(TesTask.ErrorCount)), + Tuple.Create(typeof(TesTask), nameof(TesTask.EndTime)), + Tuple.Create(typeof(TesTask), nameof(TesTask.PoolId)), + Tuple.Create(typeof(TesTask), nameof(TesTask.WorkflowId)), + Tuple.Create(typeof(RepositoryItem), nameof(RepositoryItem.ETag)), + Tuple.Create(typeof(RepositoryItem), nameof(RepositoryItem.PartitionKey)) + }; /// /// Instance of the resolver diff --git a/src/TesApi.Web/ContractResolvers/FullTesTaskContractResolver.cs b/src/TesApi.Web/ContractResolvers/FullTesTaskContractResolver.cs index 4029212f0..e9c549443 100644 --- a/src/TesApi.Web/ContractResolvers/FullTesTaskContractResolver.cs +++ b/src/TesApi.Web/ContractResolvers/FullTesTaskContractResolver.cs @@ -19,13 +19,14 @@ public class FullTesTaskContractResolver : DefaultContractResolver // In FULL view, task message will include all fields EXCEPT custom fields added to support running TES with Cromwell on Azure private static readonly List> PropertiesToSkip = new() { - Tuple.Create(typeof(TesTask), nameof(TesTask.ErrorCount)), - Tuple.Create(typeof(TesTask), nameof(TesTask.EndTime)), - Tuple.Create(typeof(TesTask), nameof(TesTask.PoolId)), - Tuple.Create(typeof(TesTask), nameof(TesTask.WorkflowId)), - Tuple.Create(typeof(RepositoryItem), nameof(RepositoryItem.ETag)), - Tuple.Create(typeof(RepositoryItem), nameof(RepositoryItem.PartitionKey)) - }; + Tuple.Create(typeof(TesTask), nameof(TesTask.IsTaskDeletionRequired)), + Tuple.Create(typeof(TesTask), nameof(TesTask.ErrorCount)), + Tuple.Create(typeof(TesTask), nameof(TesTask.EndTime)), + 
Tuple.Create(typeof(TesTask), nameof(TesTask.PoolId)), + Tuple.Create(typeof(TesTask), nameof(TesTask.WorkflowId)), + Tuple.Create(typeof(RepositoryItem), nameof(RepositoryItem.ETag)), + Tuple.Create(typeof(RepositoryItem), nameof(RepositoryItem.PartitionKey)) + }; /// /// Instance of the resolver diff --git a/src/TesApi.Web/Scheduler.cs b/src/TesApi.Web/Scheduler.cs index 819295c38..a356b0141 100644 --- a/src/TesApi.Web/Scheduler.cs +++ b/src/TesApi.Web/Scheduler.cs @@ -103,7 +103,7 @@ private async ValueTask OrchestrateTesTasksOnBatch(CancellationToken stoppingTok var pools = new HashSet(); var tesTasks = (await repository.GetItemsAsync( - predicate: t => t.State == TesState.QUEUEDEnum || t.State == TesState.CANCELINGEnum, + predicate: t => t.State == TesState.QUEUEDEnum || t.State == TesState.CANCELINGEnum || t.IsTaskDeletionRequired, cancellationToken: stoppingToken)) .OrderBy(t => t.CreationTime) .ToList(); From cdb2a1754c8f147e6ac9e586c2d9cbcaa17da3a0 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 27 Sep 2023 17:05:07 -0700 Subject: [PATCH 004/202] Reimplement queueing tasks to batch --- src/TesApi.Tests/BatchSchedulerTests.cs | 20 +- src/TesApi.Web/BatchScheduler.cs | 451 ++++++++++++++---------- src/TesApi.Web/IBatchScheduler.cs | 40 ++- src/TesApi.Web/Scheduler.cs | 88 ++++- 4 files changed, 374 insertions(+), 225 deletions(-) diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index b593fdb15..2adb6605e 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -607,7 +607,7 @@ public async Task BatchJobContainsExpectedBatchPoolInformation() GetMockAllowedVms(config)); var batchScheduler = serviceProvider.GetT(); - await batchScheduler.ProcessTesTaskAsync(tesTask, System.Threading.CancellationToken.None); + await foreach (var _ in batchScheduler.ProcessQueuedTesTasksAsync(new[] { tesTask }, System.Threading.CancellationToken.None)); var 
createBatchPoolAsyncInvocation = serviceProvider.AzureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IAzureProxy.CreateBatchPoolAsync)); var addBatchTaskAsyncInvocation = serviceProvider.AzureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IAzureProxy.AddBatchTaskAsync)); @@ -921,18 +921,18 @@ public async Task SuccessfullyCompletedTaskContainsBatchNodeMetrics() BlobXferPullStart=2020-10-08T02:30:39+00:00 BlobXferPullEnd=2020-10-08T02:31:39+00:00 ExecutorPullStart=2020-10-08T02:32:39+00:00 - ExecutorPullEnd=2020-10-08T02:34:39+00:00 ExecutorImageSizeInBytes=3000000000 + ExecutorPullEnd=2020-10-08T02:34:39+00:00 DownloadStart=2020-10-08T02:35:39+00:00 + FileDownloadSizeInBytes=2000000000 DownloadEnd=2020-10-08T02:38:39+00:00 ExecutorStart=2020-10-08T02:39:39+00:00 ExecutorEnd=2020-10-08T02:43:39+00:00 UploadStart=2020-10-08T02:44:39+00:00 + FileUploadSizeInBytes=4000000000 UploadEnd=2020-10-08T02:49:39+00:00 DiskSizeInKiB=8000000 - DiskUsedInKiB=1000000 - FileDownloadSizeInBytes=2000000000 - FileUploadSizeInBytes=4000000000".Replace(" ", string.Empty); + DiskUsedInKiB=1000000".Replace(" ", string.Empty); var azureProxyReturnValues = AzureProxyReturnValues.Defaults; azureProxyReturnValues.BatchJobAndTaskState = BatchJobAndTaskStates.TaskCompletedSuccessfully; @@ -1532,7 +1532,15 @@ public async Task var batchScheduler = serviceProvider.GetT(); serviceProviderActions?.Invoke(serviceProvider); - await batchScheduler.ProcessTesTaskAsync(tesTask, System.Threading.CancellationToken.None); + await foreach (var _ in tesTask.State switch + { + TesState.CANCELINGEnum => batchScheduler.ProcessCancelledTesTasksAsync(new[] { tesTask }, System.Threading.CancellationToken.None), + TesState.QUEUEDEnum => batchScheduler.ProcessQueuedTesTasksAsync(new[] { tesTask }, System.Threading.CancellationToken.None), + TesState.COMPLETEEnum => batchScheduler.ProcessCompletedTesTasksAsync(new[] { tesTask }, new[] { new CloudTask(tesTask.Id, "") }, 
System.Threading.CancellationToken.None), + TesState.EXECUTORERROREnum => batchScheduler.ProcessCompletedTesTasksAsync(new[] { tesTask }, new[] { new CloudTask(tesTask.Id, "") }, System.Threading.CancellationToken.None), + TesState.SYSTEMERROREnum => batchScheduler.ProcessCompletedTesTasksAsync(new[] { tesTask }, new[] { new CloudTask(tesTask.Id, "") }, System.Threading.CancellationToken.None), + _ => batchScheduler.ProcessTerminatedTesTasksAsync(new[] { tesTask }, System.Threading.CancellationToken.None), + }) { } var createBatchPoolAsyncInvocation = serviceProvider.AzureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IAzureProxy.CreateBatchPoolAsync)); var addBatchTaskAsyncInvocation = serviceProvider.AzureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IAzureProxy.AddBatchTaskAsync)); diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 47a3de114..6c2bed23c 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -12,6 +12,7 @@ using System.Threading.Tasks; using Microsoft.Azure.Batch; using Microsoft.Azure.Batch.Common; +using Microsoft.Azure.Management.ApplicationInsights.Management; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using Newtonsoft.Json; @@ -67,7 +68,7 @@ public partial class BatchScheduler : IBatchScheduler private readonly IStorageAccessProvider storageAccessProvider; private readonly IBatchQuotaVerifier quotaVerifier; private readonly IBatchSkuInformationProvider skuInformationProvider; - private readonly List tesTaskStateTransitions; + //private readonly List tesTaskStateTransitions; private readonly bool usePreemptibleVmsOnly; private readonly string batchNodesSubnetId; private readonly bool disableBatchNodesPublicIpAddress; @@ -87,8 +88,6 @@ public partial class BatchScheduler : IBatchScheduler private readonly string[] taskCleanupScriptContent; private readonly IAllowedVmSizesService allowedVmSizesService; - private 
HashSet onlyLogBatchTaskStateOnce = new(); - /// /// Orchestrates s on Azure Batch /// @@ -181,12 +180,12 @@ public BatchScheduler( logger.LogInformation($"usePreemptibleVmsOnly: {usePreemptibleVmsOnly}"); - static bool tesTaskIsQueuedInitializingOrRunning(TesTask tesTask) => tesTask.State == TesState.QUEUEDEnum || tesTask.State == TesState.INITIALIZINGEnum || tesTask.State == TesState.RUNNINGEnum; - static bool tesTaskIsInitializingOrRunning(TesTask tesTask) => tesTask.State == TesState.INITIALIZINGEnum || tesTask.State == TesState.RUNNINGEnum; - static bool tesTaskIsQueuedOrInitializing(TesTask tesTask) => tesTask.State == TesState.QUEUEDEnum || tesTask.State == TesState.INITIALIZINGEnum; - static bool tesTaskIsQueued(TesTask tesTask) => tesTask.State == TesState.QUEUEDEnum; - static bool tesTaskCancellationRequested(TesTask tesTask) => tesTask.State == TesState.CANCELINGEnum; - static bool tesTaskDeletionReady(TesTask tesTask) => tesTask.IsTaskDeletionRequired; + //static bool tesTaskIsQueuedInitializingOrRunning(TesTask tesTask) => tesTask.State == TesState.QUEUEDEnum || tesTask.State == TesState.INITIALIZINGEnum || tesTask.State == TesState.RUNNINGEnum; + //static bool tesTaskIsInitializingOrRunning(TesTask tesTask) => tesTask.State == TesState.INITIALIZINGEnum || tesTask.State == TesState.RUNNINGEnum; + //static bool tesTaskIsQueuedOrInitializing(TesTask tesTask) => tesTask.State == TesState.QUEUEDEnum || tesTask.State == TesState.INITIALIZINGEnum; + //static bool tesTaskIsQueued(TesTask tesTask) => tesTask.State == TesState.QUEUEDEnum; + //static bool tesTaskCancellationRequested(TesTask tesTask) => tesTask.State == TesState.CANCELINGEnum; + //static bool tesTaskDeletionReady(TesTask tesTask) => tesTask.IsTaskDeletionRequired; static void SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, CombinedBatchTaskInfo batchInfo) { @@ -268,25 +267,25 @@ Task HandlePreemptedNodeAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, return 
Task.FromResult(false); } - tesTaskStateTransitions = new List() - { - new TesTaskStateTransition(tesTaskDeletionReady, batchTaskState: null, alternateSystemLogItem: null, DeleteCancelledTaskAsync), - new TesTaskStateTransition(tesTaskCancellationRequested, batchTaskState: null, alternateSystemLogItem: null, CancelTaskAsync), - new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.JobNotFound, alternateSystemLogItem: null, (tesTask, _, ct) => AddBatchTaskAsync(tesTask, ct)), - new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.MissingBatchTask, alternateSystemLogItem: null, (tesTask, batchInfo, ct) => AddBatchTaskAsync(tesTask, ct)), - new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.Initializing, alternateSystemLogItem: null, (tesTask, _) => tesTask.State = TesState.INITIALIZINGEnum), - new TesTaskStateTransition(tesTaskIsQueuedOrInitializing, BatchTaskState.NodeAllocationFailed, alternateSystemLogItem: null, DeleteBatchJobAndRequeueTaskAsync), - new TesTaskStateTransition(tesTaskIsQueuedOrInitializing, BatchTaskState.Running, alternateSystemLogItem: null, (tesTask, _) => tesTask.State = TesState.RUNNINGEnum), - new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, BatchTaskState.MoreThanOneActiveJobOrTaskFound, BatchTaskState.MoreThanOneActiveJobOrTaskFound.ToString(), DeleteBatchJobAndSetTaskSystemErrorAsync), - new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, BatchTaskState.CompletedSuccessfully, alternateSystemLogItem: null, SetTaskCompleted), - new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, BatchTaskState.CompletedWithErrors, "Please open an issue. 
There should have been an error reported here.", SetTaskExecutorError), - new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, BatchTaskState.ActiveJobWithMissingAutoPool, alternateSystemLogItem: null, DeleteBatchJobAndRequeueTaskAsync), - new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, BatchTaskState.NodeFailedDuringStartupOrExecution, "Please open an issue. There should have been an error reported here.", DeleteBatchJobAndSetTaskExecutorErrorAsync), - new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, BatchTaskState.NodeUnusable, "Please open an issue. There should have been an error reported here.", DeleteBatchJobAndSetTaskExecutorErrorAsync), - new TesTaskStateTransition(tesTaskIsInitializingOrRunning, BatchTaskState.JobNotFound, BatchTaskState.JobNotFound.ToString(), SetTaskSystemError), - new TesTaskStateTransition(tesTaskIsInitializingOrRunning, BatchTaskState.MissingBatchTask, BatchTaskState.MissingBatchTask.ToString(), DeleteBatchJobAndSetTaskSystemErrorAsync), - new TesTaskStateTransition(tesTaskIsInitializingOrRunning, BatchTaskState.NodePreempted, alternateSystemLogItem: null, HandlePreemptedNodeAsync) - }; + //tesTaskStateTransitions = new List() + //{ + // new TesTaskStateTransition(tesTaskDeletionReady, batchTaskState: null, alternateSystemLogItem: null, DeleteCancelledTaskAsync), + // new TesTaskStateTransition(tesTaskCancellationRequested, batchTaskState: null, alternateSystemLogItem: null, CancelTaskAsync), + // new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.JobNotFound, alternateSystemLogItem: null, (tesTask, _, ct) => AddBatchTaskAsync(tesTask, ct)), + // new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.MissingBatchTask, alternateSystemLogItem: null, (tesTask, batchInfo, ct) => AddBatchTaskAsync(tesTask, ct)), + // new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.Initializing, alternateSystemLogItem: null, (tesTask, _) => tesTask.State = TesState.INITIALIZINGEnum), + // 
new TesTaskStateTransition(tesTaskIsQueuedOrInitializing, BatchTaskState.NodeAllocationFailed, alternateSystemLogItem: null, DeleteBatchJobAndRequeueTaskAsync), + // new TesTaskStateTransition(tesTaskIsQueuedOrInitializing, BatchTaskState.Running, alternateSystemLogItem: null, (tesTask, _) => tesTask.State = TesState.RUNNINGEnum), + // new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, BatchTaskState.MoreThanOneActiveJobOrTaskFound, BatchTaskState.MoreThanOneActiveJobOrTaskFound.ToString(), DeleteBatchJobAndSetTaskSystemErrorAsync), + // new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, BatchTaskState.CompletedSuccessfully, alternateSystemLogItem: null, SetTaskCompleted), + // new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, BatchTaskState.CompletedWithErrors, "Please open an issue. There should have been an error reported here.", SetTaskExecutorError), + // new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, BatchTaskState.ActiveJobWithMissingAutoPool, alternateSystemLogItem: null, DeleteBatchJobAndRequeueTaskAsync), + // new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, BatchTaskState.NodeFailedDuringStartupOrExecution, "Please open an issue. There should have been an error reported here.", DeleteBatchJobAndSetTaskExecutorErrorAsync), + // new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, BatchTaskState.NodeUnusable, "Please open an issue. 
There should have been an error reported here.", DeleteBatchJobAndSetTaskExecutorErrorAsync), + // new TesTaskStateTransition(tesTaskIsInitializingOrRunning, BatchTaskState.JobNotFound, BatchTaskState.JobNotFound.ToString(), SetTaskSystemError), + // new TesTaskStateTransition(tesTaskIsInitializingOrRunning, BatchTaskState.MissingBatchTask, BatchTaskState.MissingBatchTask.ToString(), DeleteBatchJobAndSetTaskSystemErrorAsync), + // new TesTaskStateTransition(tesTaskIsInitializingOrRunning, BatchTaskState.NodePreempted, alternateSystemLogItem: null, HandlePreemptedNodeAsync) + //}; } private async Task DeleteCancelledTaskAsync(TesTask task, CombinedBatchTaskInfo info, CancellationToken token) @@ -362,40 +361,48 @@ public async Task UploadTaskRunnerIfNeeded(CancellationToken cancellationToken) } } - /// - /// Iteratively manages execution of a on Azure Batch until completion or failure - /// - /// The - /// A for controlling the lifetime of the asynchronous operation. - /// True if the TES task needs to be persisted. - public async ValueTask ProcessTesTaskAsync(TesTask tesTask, CancellationToken cancellationToken) - { - var combinedBatchTaskInfo = await GetBatchTaskStateAsync(tesTask, cancellationToken); - const string template = "TES task: {TesTask} TES task state: {TesTaskState} BatchTaskState: {BatchTaskState}"; - var msg = string.Format(ConvertTemplateToFormat(template), tesTask.Id, tesTask.State.ToString(), combinedBatchTaskInfo.BatchTaskState.ToString()); - - if (onlyLogBatchTaskStateOnce.Add(msg)) - { - logger.LogInformation(template, tesTask.Id, tesTask.State.ToString(), combinedBatchTaskInfo.BatchTaskState.ToString()); - } + ///// + ///// Iteratively manages execution of a on Azure Batch until completion or failure + ///// + ///// The + ///// A for controlling the lifetime of the asynchronous operation. + ///// True if the TES task needs to be persisted. 
+ //public async ValueTask ProcessTesTaskAsync(TesTask tesTask, CancellationToken cancellationToken) + //{ + // var combinedBatchTaskInfo = await GetBatchTaskStateAsync(tesTask, cancellationToken); + // const string template = "TES task: {TesTask} TES task state: {TesTaskState} BatchTaskState: {BatchTaskState}"; + // var msg = string.Format(ConvertTemplateToFormat(template), tesTask.Id, tesTask.State.ToString(), combinedBatchTaskInfo.BatchTaskState.ToString()); + + // if (onlyLogBatchTaskStateOnce.Add(msg)) + // { + // logger.LogInformation(template, tesTask.Id, tesTask.State.ToString(), combinedBatchTaskInfo.BatchTaskState.ToString()); + // } + + // return await HandleTesTaskTransitionAsync(tesTask, combinedBatchTaskInfo, cancellationToken); + + // static string ConvertTemplateToFormat(string template) + // => string.Join(null, template.Split('{', '}').Select((s, i) => (s, i)).Select(t => t.i % 2 == 0 ? t.s : $"{{{t.i / 2}}}")); + //} - return await HandleTesTaskTransitionAsync(tesTask, combinedBatchTaskInfo, cancellationToken); + /// + public async IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessCompletedTesTasksAsync(IEnumerable tesTask, CloudTask[] cloudTask, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) + { + yield break; + } - static string ConvertTemplateToFormat(string template) - => string.Join(null, template.Split('{', '}').Select((s, i) => (s, i)).Select(t => t.i % 2 == 0 ? 
t.s : $"{{{t.i / 2}}}")); + /// + public async IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessCancelledTesTasksAsync(IEnumerable tesTask, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) + { + yield break; } - /// - /// Garbage collects the old batch task state log hashset - /// - public void ClearBatchLogState() + /// + public async IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessTerminatedTesTasksAsync(IEnumerable tesTask, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) { - if (onlyLogBatchTaskStateOnce.Count > 0) - { - onlyLogBatchTaskStateOnce = new(); - } + yield break; } + private static string GetCromwellExecutionDirectoryPathAsUrl(TesTask task) { var commandScript = task.Inputs?.FirstOrDefault(IsCromwellCommandScript); @@ -478,90 +485,144 @@ private bool TryGetCromwellTmpFilePath(string fileUri, out string localPath) return localPath is not null; } - /// - /// Adds a new Azure Batch pool/job/task for the given - /// - /// The to schedule on Azure Batch - /// A for controlling the lifetime of the asynchronous operation. - /// A task to await - private async Task AddBatchTaskAsync(TesTask tesTask, CancellationToken cancellationToken) + /// + public async IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessQueuedTesTasksAsync(TesTask[] tesTasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) { - PoolInformation poolInformation = null; - string poolKey = null; + var taskMetadataByPools = new Dictionary>(); + var poolKeyByTasks = new Dictionary(); + var tasks = tesTasks.ToList(); - try + // Determe how many nodes in each pool we might need for this group. 
+ foreach (var tesTask in tesTasks) { - var virtualMachineInfo = await GetVmSizeAsync(tesTask, cancellationToken); + Task quickResult = default; + string poolKey = default; - var containerMetadata = await GetContainerConfigurationIfNeededAsync(tesTask, cancellationToken); - (poolKey, var displayName) = GetPoolKey(tesTask, virtualMachineInfo, containerMetadata.ContainerConfiguration, cancellationToken); - await quotaVerifier.CheckBatchAccountQuotasAsync(virtualMachineInfo, needPoolOrJobQuotaCheck: !IsPoolAvailable(poolKey), cancellationToken: cancellationToken); + try + { + var virtualMachineInfo = await GetVmSizeAsync(tesTask, cancellationToken); + var containerMetadata = await GetContainerConfigurationIfNeededAsync(tesTask, cancellationToken); + (poolKey, var displayName) = GetPoolKey(tesTask, virtualMachineInfo, containerMetadata.ContainerConfiguration, cancellationToken); + await quotaVerifier.CheckBatchAccountQuotasAsync(virtualMachineInfo, needPoolOrJobQuotaCheck: !IsPoolAvailable(poolKey), cancellationToken: cancellationToken); - var tesTaskLog = tesTask.AddTesTaskLog(); - tesTaskLog.VirtualMachineInfo = virtualMachineInfo; - var identities = new List(); + if (taskMetadataByPools.TryGetValue(poolKey, out var resource)) + { + resource.Add((tesTask, virtualMachineInfo, containerMetadata, displayName)); + } + else + { + taskMetadataByPools.Add(poolKey, new() { (tesTask, virtualMachineInfo, containerMetadata, displayName) }); + } - if (!string.IsNullOrWhiteSpace(globalManagedIdentity)) + poolKeyByTasks.Add(tesTask.Id, poolKey); + } + catch (Exception ex) { - identities.Add(globalManagedIdentity); + quickResult = HandleException(ex, poolKey, tesTask); } - if (tesTask.Resources?.ContainsBackendParameterValue(TesResources.SupportedBackendParameters.workflow_execution_identity) == true) + if (quickResult is not null) { - identities.Add(tesTask.Resources?.GetBackendParameterValue(TesResources.SupportedBackendParameters.workflow_execution_identity)); + 
tasks.Remove(tesTask); + yield return (tesTask, quickResult); } - - var useGen2 = virtualMachineInfo.HyperVGenerations?.Contains("V2"); - poolInformation = (await GetOrAddPoolAsync( - key: poolKey, - isPreemptable: virtualMachineInfo.LowPriority, - modelPoolFactory: async (id, ct) => ConvertPoolSpecificationToModelsPool( - name: id, - displayName: displayName, - poolIdentity: GetBatchPoolIdentity(identities.ToArray()), - pool: await GetPoolSpecification( - vmSize: virtualMachineInfo.VmSize, - autoscaled: true, - preemptable: virtualMachineInfo.LowPriority, - nodeInfo: useGen2.GetValueOrDefault() ? gen2BatchNodeInfo : gen1BatchNodeInfo, - containerConfiguration: containerMetadata.ContainerConfiguration, - encryptionAtHostSupported: virtualMachineInfo.EncryptionAtHostSupported, - cancellationToken: ct)), - cancellationToken: cancellationToken) - ).Pool; - - var jobOrTaskId = $"{tesTask.Id}-{tesTask.Logs.Count}"; - tesTask.PoolId = poolInformation.PoolId; - var cloudTask = await ConvertTesTaskToBatchTaskAsync(jobOrTaskId, tesTask, containerMetadata.IsPublic, cancellationToken); - - logger.LogInformation($"Creating batch task for TES task {tesTask.Id}. 
Using VM size {virtualMachineInfo.VmSize}."); - await azureProxy.AddBatchTaskAsync(tesTask.Id, cloudTask, poolInformation, cancellationToken); - - tesTaskLog.StartTime = DateTimeOffset.UtcNow; - tesTask.State = TesState.INITIALIZINGEnum; - poolInformation = null; } - catch (AggregateException aggregateException) + + var neededPoolCounts = taskMetadataByPools.ToDictionary(t => t.Key, t => t.Value.Count); + tesTasks = tasks.ToArray(); + + foreach (var tesTask in tesTasks) { - foreach (var exception in aggregateException.Flatten().InnerExceptions) + Task quickResult = default; + var poolKey = poolKeyByTasks[tesTask.Id]; + var (_, virtualMachineInfo, containerMetadata, displayName) = taskMetadataByPools[poolKey].Single(t => t.TesTask.Id == tesTask.Id); + + try + { + PoolInformation poolInformation = null; + var tesTaskLog = tesTask.AddTesTaskLog(); + tesTaskLog.VirtualMachineInfo = virtualMachineInfo; + var identities = new List(); + + if (!string.IsNullOrWhiteSpace(globalManagedIdentity)) + { + identities.Add(globalManagedIdentity); + } + + if (tesTask.Resources?.ContainsBackendParameterValue(TesResources.SupportedBackendParameters.workflow_execution_identity) == true) + { + identities.Add(tesTask.Resources?.GetBackendParameterValue(TesResources.SupportedBackendParameters.workflow_execution_identity)); + } + + var useGen2 = virtualMachineInfo.HyperVGenerations?.Contains("V2"); + poolInformation = (await GetOrAddPoolAsync( + key: poolKey, + isPreemptable: virtualMachineInfo.LowPriority, + modelPoolFactory: async (id, ct) => ConvertPoolSpecificationToModelsPool( + name: id, + displayName: displayName, + poolIdentity: GetBatchPoolIdentity(identities.ToArray()), + pool: await GetPoolSpecification( + vmSize: virtualMachineInfo.VmSize, + autoscaled: true, + preemptable: virtualMachineInfo.LowPriority, + nodeInfo: useGen2.GetValueOrDefault() ? 
gen2BatchNodeInfo : gen1BatchNodeInfo, + containerConfiguration: containerMetadata.ContainerConfiguration, + encryptionAtHostSupported: virtualMachineInfo.EncryptionAtHostSupported, + cancellationToken: ct)), + cancellationToken: cancellationToken) + ).Pool; + + var jobOrTaskId = $"{tesTask.Id}-{tesTask.Logs.Count}"; + tesTask.PoolId = poolInformation.PoolId; + var cloudTask = await ConvertTesTaskToBatchTaskAsync(jobOrTaskId, tesTask, containerMetadata.IsPublic, cancellationToken); + + logger.LogInformation($"Creating batch task for TES task {tesTask.Id}. Using VM size {virtualMachineInfo.VmSize}."); + await azureProxy.AddBatchTaskAsync(tesTask.Id, cloudTask, poolInformation, cancellationToken); + + tesTaskLog.StartTime = DateTimeOffset.UtcNow; + tesTask.State = TesState.INITIALIZINGEnum; + poolInformation = null; + } + catch (AggregateException aggregateException) { - HandleException(exception); + var exceptions = new List(); + + foreach (var partResult in aggregateException.Flatten().InnerExceptions.Select(ex => HandleException(ex, poolKey, tesTask))) + { + if (partResult.IsFaulted) + { + exceptions.Add(partResult.Exception); + } + } + + quickResult = exceptions.Count == 0 + ? 
Task.FromResult(true) + : Task.FromException(new AggregateException(exceptions)); + } + catch (Exception exception) + { + quickResult = HandleException(exception, poolKey, tesTask); + } + + if (quickResult is not null) + { + yield return (tesTask, quickResult); + } + else + { + yield return (tesTask, Task.FromResult(true)); } - } - catch (Exception exception) - { - HandleException(exception); } - void HandleException(Exception exception) + Task HandleException(Exception exception, string poolKey, TesTask tesTask) { switch (exception) { case AzureBatchPoolCreationException azureBatchPoolCreationException: if (!azureBatchPoolCreationException.IsTimeout && !azureBatchPoolCreationException.IsJobQuota && !azureBatchPoolCreationException.IsPoolQuota && azureBatchPoolCreationException.InnerException is not null) { - HandleException(azureBatchPoolCreationException.InnerException); - return; + return HandleException(azureBatchPoolCreationException.InnerException, poolKey, tesTask); } logger.LogWarning(azureBatchPoolCreationException, "TES task: {TesTask} AzureBatchPoolCreationException.Message: {ExceptionMessage}. This might be a transient issue. Task will remain with state QUEUED. Confirmed timeout: {ConfirmedTimeout}", tesTask.Id, azureBatchPoolCreationException.Message, azureBatchPoolCreationException.IsTimeout); @@ -635,6 +696,8 @@ void HandleException(Exception exception) logger.LogError(exception, "TES task: {TesTask} Exception: {ExceptionType}: {ExceptionMessage}", tesTask.Id, exception?.GetType().FullName, exception?.Message); break; } + + return Task.FromResult(true); } } @@ -872,27 +935,27 @@ bool ProcessStartTaskFailure(TaskFailureInformation failureInformation) } } - /// - /// Transitions the to the new state, based on the rules defined in the tesTaskStateTransitions list. - /// - /// TES task - /// Current Azure Batch task info - /// A for controlling the lifetime of the asynchronous operation. - /// True if the TES task was changed. 
- // When task is executed the following may be touched: - // tesTask.Log[].SystemLog - // tesTask.Log[].FailureReason - // tesTask.Log[].CromwellResultCode - // tesTask.Log[].BatchExecutionMetrics - // tesTask.Log[].EndTime - // tesTask.Log[].Log[].StdErr - // tesTask.Log[].Log[].ExitCode - // tesTask.Log[].Log[].StartTime - // tesTask.Log[].Log[].EndTime - private ValueTask HandleTesTaskTransitionAsync(TesTask tesTask, CombinedBatchTaskInfo combinedBatchTaskInfo, CancellationToken cancellationToken) - => (tesTaskStateTransitions - .FirstOrDefault(m => (m.Condition is null || m.Condition(tesTask)) && (m.CurrentBatchTaskState is null || m.CurrentBatchTaskState == combinedBatchTaskInfo.BatchTaskState)) - ?.ActionAsync(tesTask, combinedBatchTaskInfo, cancellationToken) ?? ValueTask.FromResult(false)); + ///// + ///// Transitions the to the new state, based on the rules defined in the tesTaskStateTransitions list. + ///// + ///// TES task + ///// Current Azure Batch task info + ///// A for controlling the lifetime of the asynchronous operation. + ///// True if the TES task was changed. + //// When task is executed the following may be touched: + //// tesTask.Log[].SystemLog + //// tesTask.Log[].FailureReason + //// tesTask.Log[].CromwellResultCode + //// tesTask.Log[].BatchExecutionMetrics + //// tesTask.Log[].EndTime + //// tesTask.Log[].Log[].StdErr + //// tesTask.Log[].Log[].ExitCode + //// tesTask.Log[].Log[].StartTime + //// tesTask.Log[].Log[].EndTime + //private ValueTask HandleTesTaskTransitionAsync(TesTask tesTask, CombinedBatchTaskInfo combinedBatchTaskInfo, CancellationToken cancellationToken) + // => (tesTaskStateTransitions + // .FirstOrDefault(m => (m.Condition is null || m.Condition(tesTask)) && (m.CurrentBatchTaskState is null || m.CurrentBatchTaskState == combinedBatchTaskInfo.BatchTaskState)) + // ?.ActionAsync(tesTask, combinedBatchTaskInfo, cancellationToken) ?? 
ValueTask.FromResult(false)); /// /// Returns job preparation and main Batch tasks that represents the given @@ -1862,63 +1925,63 @@ private static Dictionary DelimitedTextToDictionary(string text, .Select(line => { var parts = line.Split(fieldDelimiter); return new KeyValuePair(parts[0], parts[1]); }) .ToDictionary(kv => kv.Key, kv => kv.Value); - /// - /// Class that captures how transitions from current state to the new state, given the current Batch task state and optional condition. - /// Transitions typically include an action that needs to run in order for the task to move to the new state. - /// - private class TesTaskStateTransition - { - public TesTaskStateTransition(Func condition, BatchTaskState? batchTaskState, string alternateSystemLogItem, Func asyncAction) - : this(condition, batchTaskState, alternateSystemLogItem, asyncAction, null) - { } - - public TesTaskStateTransition(Func condition, BatchTaskState? batchTaskState, string alternateSystemLogItem, Action action) - : this(condition, batchTaskState, alternateSystemLogItem, null, action) - { - } - - private TesTaskStateTransition(Func condition, BatchTaskState? batchTaskState, string alternateSystemLogItem, Func asyncAction, Action action) - { - Condition = condition; - CurrentBatchTaskState = batchTaskState; - AlternateSystemLogItem = alternateSystemLogItem; - AsyncAction = asyncAction; - Action = action; - } - - public Func Condition { get; } - public BatchTaskState? CurrentBatchTaskState { get; } - private string AlternateSystemLogItem { get; } - private Func AsyncAction { get; } - private Action Action { get; } - - /// - /// Calls and/or . - /// - /// - /// - /// A for controlling the lifetime of the asynchronous operation. - /// True an action was called, otherwise False. 
- public async ValueTask ActionAsync(TesTask tesTask, CombinedBatchTaskInfo combinedBatchTaskInfo, CancellationToken cancellationToken) - { - combinedBatchTaskInfo.AlternateSystemLogItem = AlternateSystemLogItem; - var tesTaskChanged = false; - - if (AsyncAction is not null) - { - await AsyncAction(tesTask, combinedBatchTaskInfo, cancellationToken); - tesTaskChanged = true; - } - - if (Action is not null) - { - Action(tesTask, combinedBatchTaskInfo); - tesTaskChanged = true; - } - - return tesTaskChanged; - } - } + ///// + ///// Class that captures how transitions from current state to the new state, given the current Batch task state and optional condition. + ///// Transitions typically include an action that needs to run in order for the task to move to the new state. + ///// + //private class TesTaskStateTransition + //{ + // public TesTaskStateTransition(Func condition, BatchTaskState? batchTaskState, string alternateSystemLogItem, Func asyncAction) + // : this(condition, batchTaskState, alternateSystemLogItem, asyncAction, null) + // { } + + // public TesTaskStateTransition(Func condition, BatchTaskState? batchTaskState, string alternateSystemLogItem, Action action) + // : this(condition, batchTaskState, alternateSystemLogItem, null, action) + // { + // } + + // private TesTaskStateTransition(Func condition, BatchTaskState? batchTaskState, string alternateSystemLogItem, Func asyncAction, Action action) + // { + // Condition = condition; + // CurrentBatchTaskState = batchTaskState; + // AlternateSystemLogItem = alternateSystemLogItem; + // AsyncAction = asyncAction; + // Action = action; + // } + + // public Func Condition { get; } + // public BatchTaskState? CurrentBatchTaskState { get; } + // private string AlternateSystemLogItem { get; } + // private Func AsyncAction { get; } + // private Action Action { get; } + + // /// + // /// Calls and/or . + // /// + // /// + // /// + // /// A for controlling the lifetime of the asynchronous operation. 
+ // /// True an action was called, otherwise False. + // public async ValueTask ActionAsync(TesTask tesTask, CombinedBatchTaskInfo combinedBatchTaskInfo, CancellationToken cancellationToken) + // { + // combinedBatchTaskInfo.AlternateSystemLogItem = AlternateSystemLogItem; + // var tesTaskChanged = false; + + // if (AsyncAction is not null) + // { + // await AsyncAction(tesTask, combinedBatchTaskInfo, cancellationToken); + // tesTaskChanged = true; + // } + + // if (Action is not null) + // { + // Action(tesTask, combinedBatchTaskInfo); + // tesTaskChanged = true; + // } + + // return tesTaskChanged; + // } + //} private class ExternalStorageContainerInfo { diff --git a/src/TesApi.Web/IBatchScheduler.cs b/src/TesApi.Web/IBatchScheduler.cs index 663ff04f1..9f44572b2 100644 --- a/src/TesApi.Web/IBatchScheduler.cs +++ b/src/TesApi.Web/IBatchScheduler.cs @@ -36,12 +36,39 @@ public interface IBatchScheduler Task UploadTaskRunnerIfNeeded(CancellationToken cancellationToken); /// - /// Iteratively schedule a on a batch system until completion or failure + /// Schedule queued s on a batch system /// - /// to schedule on the batch system + /// s to schedule on the batch system. /// A for controlling the lifetime of the asynchronous operation. - /// Whether the was modified. - ValueTask ProcessTesTaskAsync(TesTask tesTask, CancellationToken cancellationToken); + /// True for each corresponding that needs to be persisted. + IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessQueuedTesTasksAsync(TesTask[] tesTask, CancellationToken cancellationToken); + + /// + /// Finalize completed s on a batch system + /// + /// s to schedule on the batch system. + /// s corresponding to each . + /// A for controlling the lifetime of the asynchronous operation. + /// True for each corresponding that needs to be persisted. 
+ IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessCompletedTesTasksAsync(IEnumerable tesTask, CloudTask[] cloudTask, CancellationToken cancellationToken); + + /// + /// Terminate cancelled s on a batch system + /// + /// s to schedule on the batch system. + /// A for controlling the lifetime of the asynchronous operation. + /// True for each corresponding that needs to be persisted. + IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessCancelledTesTasksAsync(IEnumerable tesTask, CancellationToken cancellationToken); + + /// + /// Delete terminated s on a batch system + /// + /// s to schedule on the batch system. + /// A for controlling the lifetime of the asynchronous operation. + /// True for each corresponding that needs to be persisted. + IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessTerminatedTesTasksAsync(IEnumerable tesTask, CancellationToken cancellationToken); + + //TODO: Add other task update methods here. /// /// Adds to the managed batch pools. @@ -78,11 +105,6 @@ public interface IBatchScheduler /// bool RemovePoolFromList(IBatchPool pool); - /// - /// Garbage collects the old batch task state log hashset - /// - void ClearBatchLogState(); - /// /// Flushes empty pools to accomodate pool quota limits. 
/// diff --git a/src/TesApi.Web/Scheduler.cs b/src/TesApi.Web/Scheduler.cs index a356b0141..db6531e73 100644 --- a/src/TesApi.Web/Scheduler.cs +++ b/src/TesApi.Web/Scheduler.cs @@ -66,11 +66,73 @@ protected override async Task ExecuteAsync(CancellationToken stoppingToken) logger.LogInformation("Scheduler started."); + await Task.WhenAll(ExecuteCancelledTesTasksOnBatch(stoppingToken), ExecuteQueuedTesTasksOnBatch(stoppingToken), ExecuteTerminatedTesTasksOnBatch(stoppingToken)); + + logger.LogInformation("Scheduler gracefully stopped."); + } + + /// + /// Retrieves all queued TES tasks from the database, performs an action in the batch system, and updates the resultant state + /// + /// + private async Task ExecuteQueuedTesTasksOnBatch(CancellationToken stoppingToken) + { + await ExecuteTesTasksOnBatch( + async () => (await repository.GetItemsAsync( + predicate: t => t.State == TesState.QUEUEDEnum, + cancellationToken: stoppingToken)) + .OrderBy(t => t.CreationTime) + .ToAsyncEnumerable(), + + tasks => batchScheduler.ProcessQueuedTesTasksAsync(tasks, stoppingToken), + stoppingToken); + } + + /// + /// Retrieves all cancelled TES tasks from the database, performs an action in the batch system, and updates the resultant state + /// + /// + private async Task ExecuteCancelledTesTasksOnBatch(CancellationToken stoppingToken) + { + await ExecuteTesTasksOnBatch( + async () => (await repository.GetItemsAsync( + predicate: t => t.State == TesState.CANCELINGEnum, + cancellationToken: stoppingToken)) + .OrderBy(t => t.CreationTime) + .ToAsyncEnumerable(), + + tasks => batchScheduler.ProcessCancelledTesTasksAsync(tasks, stoppingToken), + stoppingToken); + } + + /// + /// Retrieves all terminated TES tasks from the database, performs an action in the batch system, and updates the resultant state + /// + /// + private async Task ExecuteTerminatedTesTasksOnBatch(CancellationToken stoppingToken) + { + await ExecuteTesTasksOnBatch( + async () => (await repository.GetItemsAsync( + 
predicate: t => t.IsTaskDeletionRequired, + cancellationToken: stoppingToken)) + .OrderBy(t => t.CreationTime) + .ToAsyncEnumerable(), + + tasks => batchScheduler.ProcessTerminatedTesTasksAsync(tasks, stoppingToken), + stoppingToken); + } + + /// + /// Retrieves provided actionable TES tasks from the database, performs an action in the batch system, and updates the resultant state + /// + /// + private async ValueTask ExecuteTesTasksOnBatch(Func>> tesTaskGetter, Func IsModified)>> tesTaskProcessor, CancellationToken stoppingToken) + { while (!stoppingToken.IsCancellationRequested) { try { - await OrchestrateTesTasksOnBatch(stoppingToken); + await OrchestrateTesTasksOnBatch(tesTaskGetter, tesTaskProcessor, stoppingToken); } catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested) { @@ -90,40 +152,32 @@ protected override async Task ExecuteAsync(CancellationToken stoppingToken) break; } } - - logger.LogInformation("Scheduler gracefully stopped."); } /// - /// Retrieves all actionable TES tasks from the database, performs an action in the batch system, and updates the resultant state + /// Retrieves provided actionable TES tasks from the database, performs an action in the batch system, and updates the resultant state /// /// - private async ValueTask OrchestrateTesTasksOnBatch(CancellationToken stoppingToken) + private async ValueTask OrchestrateTesTasksOnBatch(Func>> tesTaskGetter, Func IsModified)>> tesTaskProcessor, CancellationToken stoppingToken) { var pools = new HashSet(); + var tesTasks = await (await tesTaskGetter()).ToArrayAsync(stoppingToken); - var tesTasks = (await repository.GetItemsAsync( - predicate: t => t.State == TesState.QUEUEDEnum || t.State == TesState.CANCELINGEnum || t.IsTaskDeletionRequired, - cancellationToken: stoppingToken)) - .OrderBy(t => t.CreationTime) - .ToList(); - - if (0 == tesTasks.Count) + if (0 == tesTasks.Length) { - batchScheduler.ClearBatchLogState(); return; } var startTime = DateTime.UtcNow; - foreach 
(var tesTask in tesTasks) + await foreach (var (tesTask, waitableResult) in tesTaskProcessor(tesTasks).WithCancellation(stoppingToken)) { try { var isModified = false; try { - isModified = await batchScheduler.ProcessTesTaskAsync(tesTask, stoppingToken); + isModified = await waitableResult; } catch (Exception exc) { @@ -201,6 +255,8 @@ private async ValueTask OrchestrateTesTasksOnBatch(CancellationToken stoppingTok catch (RepositoryCollisionException exc) { logger.LogError(exc, $"RepositoryCollisionException in OrchestrateTesTasksOnBatch"); + //TODO: retrieve fresh task if possible and add logs to the task in a similar way to the commanted out code block below. + //Also: consider doing the same in the other place(s) this exception is caught. } // TODO catch EF / postgres exception? //catch (Microsoft.Azure.Cosmos.CosmosException exc) @@ -240,7 +296,7 @@ private async ValueTask OrchestrateTesTasksOnBatch(CancellationToken stoppingTok await batchScheduler.FlushPoolsAsync(pools, stoppingToken); } - logger.LogDebug("OrchestrateTesTasksOnBatch for {TaskCount} tasks completed in {TotalSeconds} seconds.", tesTasks.Count, DateTime.UtcNow.Subtract(startTime).TotalSeconds); + //logger.LogDebug("OrchestrateTesTasksOnBatch for {TaskCount} tasks completed in {TotalSeconds} seconds.", tesTasks.Count, DateTime.UtcNow.Subtract(startTime).TotalSeconds); } } } From 02f9fa7ced7b73cff7985a3bf2fd40b957ec1287 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 27 Sep 2023 18:37:09 -0700 Subject: [PATCH 005/202] Reimplement canceling tasks in batch --- src/TesApi.Tests/BatchSchedulerTests.cs | 6 +- src/TesApi.Web/AzureProxy.cs | 8 +- src/TesApi.Web/BatchScheduler.cs | 80 +++++++++++++------ .../CachingWithRetriesAzureProxy.cs | 4 +- src/TesApi.Web/IAzureProxy.cs | 8 +- src/TesApi.Web/IBatchScheduler.cs | 16 ++-- 6 files changed, 77 insertions(+), 45 deletions(-) diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index 2adb6605e..ab1844b6a 
100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -885,14 +885,14 @@ public async Task TaskGetsCancelled() { Assert.AreEqual(TesState.CANCELEDEnum, tesTask.State); Assert.IsTrue(tesTask.IsTaskDeletionRequired); - azureProxy.Verify(i => i.TerminateBatchTaskAsync(tesTask.Id, It.IsAny(), It.IsAny())); + azureProxy.Verify(i => i.TerminateBatchTaskAsync(tesTask.Id, It.IsAny(), It.IsAny())); }); } [TestMethod] public async Task CancelledTaskGetsDeleted() { - var tesTask = new TesTask { Id = "test", PoolId = "pool1", State = TesState.CANCELEDEnum, IsTaskDeletionRequired = true }; + var tesTask = new TesTask { Id = "test", PoolId = "pool1", State = TesState.CANCELEDEnum, IsTaskDeletionRequired = true, Logs = new() { new() { StartTime = DateTimeOffset.UtcNow - TimeSpan.FromMinutes(11) } } }; var azureProxyReturnValues = AzureProxyReturnValues.Defaults; azureProxyReturnValues.BatchJobAndTaskState = BatchJobAndTaskStates.TaskActive; @@ -908,7 +908,7 @@ public async Task CancelledTaskGetsDeleted() GuardAssertsWithTesTask(tesTask, () => { Assert.IsFalse(tesTask.IsTaskDeletionRequired); - azureProxy.Verify(i => i.DeleteBatchTaskAsync(tesTask.Id, It.IsAny(), It.IsAny())); + azureProxy.Verify(i => i.DeleteBatchTaskAsync(tesTask.Id, It.IsAny(), It.IsAny())); }); } diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index 1614a6aef..4d4699d3e 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -397,7 +397,7 @@ public async Task DeleteBatchJobAsync(string tesTaskId, CancellationToken cancel } /// - public async Task TerminateBatchTaskAsync(string tesTaskId, PoolInformation pool, CancellationToken cancellationToken) + public async Task TerminateBatchTaskAsync(string tesTaskId, string jobId, CancellationToken cancellationToken) { var jobFilter = new ODATADetailLevel { @@ -409,7 +409,7 @@ public async Task TerminateBatchTaskAsync(string tesTaskId, PoolInformation pool try { - 
batchTasksToDelete = await batchClient.JobOperations.ListTasks(pool.PoolId, jobFilter).ToAsyncEnumerable().ToListAsync(cancellationToken); + batchTasksToDelete = await batchClient.JobOperations.ListTasks(jobId, jobFilter).ToAsyncEnumerable().ToListAsync(cancellationToken); } catch (BatchException ex) when (ex.InnerException is Microsoft.Azure.Batch.Protocol.Models.BatchErrorException bee && "JobNotFound".Equals(bee.Body?.Code, StringComparison.InvariantCultureIgnoreCase)) { @@ -430,7 +430,7 @@ public async Task TerminateBatchTaskAsync(string tesTaskId, PoolInformation pool } /// - public async Task DeleteBatchTaskAsync(string tesTaskId, PoolInformation pool, CancellationToken cancellationToken) + public async Task DeleteBatchTaskAsync(string tesTaskId, string jobId, CancellationToken cancellationToken) { var jobFilter = new ODATADetailLevel { @@ -442,7 +442,7 @@ public async Task DeleteBatchTaskAsync(string tesTaskId, PoolInformation pool, C try { - batchTasksToDelete = await batchClient.JobOperations.ListTasks(pool.PoolId, jobFilter).ToAsyncEnumerable().ToListAsync(cancellationToken); + batchTasksToDelete = await batchClient.JobOperations.ListTasks(jobId, jobFilter).ToAsyncEnumerable().ToListAsync(cancellationToken); } catch (BatchException ex) when (ex.InnerException is Microsoft.Azure.Batch.Protocol.Models.BatchErrorException bee && "JobNotFound".Equals(bee.Body?.Code, StringComparison.InvariantCultureIgnoreCase)) { diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 6c2bed23c..1eb771b8d 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -12,7 +12,6 @@ using System.Threading.Tasks; using Microsoft.Azure.Batch; using Microsoft.Azure.Batch.Common; -using Microsoft.Azure.Management.ApplicationInsights.Management; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using Newtonsoft.Json; @@ -219,25 +218,25 @@ static void SetTaskStateAndLog(TesTask tesTask, TesState 
newTaskState, CombinedB async Task SetTaskCompleted(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { - await azureProxy.DeleteBatchTaskAsync(tesTask.Id, batchInfo.Pool, cancellationToken); + await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); SetTaskStateAndLog(tesTask, TesState.COMPLETEEnum, batchInfo); } async Task SetTaskExecutorError(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { - await azureProxy.DeleteBatchTaskAsync(tesTask.Id, batchInfo.Pool, cancellationToken); + await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); SetTaskStateAndLog(tesTask, TesState.EXECUTORERROREnum, batchInfo); } async Task SetTaskSystemError(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { - await azureProxy.DeleteBatchTaskAsync(tesTask.Id, batchInfo.Pool, cancellationToken); + await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); SetTaskStateAndLog(tesTask, TesState.SYSTEMERROREnum, batchInfo); } async Task DeleteBatchJobAndSetTaskStateAsync(TesTask tesTask, TesState newTaskState, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { - await azureProxy.DeleteBatchTaskAsync(tesTask.Id, batchInfo.Pool, cancellationToken); + await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); SetTaskStateAndLog(tesTask, newTaskState, batchInfo); } @@ -255,11 +254,6 @@ Task AddSystemLogAndDeleteBatchJobAndSetTaskExecutorErrorAsync(TesTask tesTask, return DeleteBatchJobAndSetTaskExecutorErrorAsync(tesTask, batchInfo, cancellationToken); } - Task CancelTaskAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) - { - return TerminateBatchTaskAsync(tesTask, batchInfo, cancellationToken); - } - Task HandlePreemptedNodeAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken 
cancellationToken) { logger.LogInformation("The TesTask {TesTask}'s node was preempted. It will be automatically rescheduled.", tesTask.Id); @@ -288,21 +282,32 @@ Task HandlePreemptedNodeAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, //}; } - private async Task DeleteCancelledTaskAsync(TesTask task, CombinedBatchTaskInfo info, CancellationToken token) + private async Task DeleteCancelledTaskAsync(TesTask tesTask, CancellationToken cancellationToken) { - // TODO: check if task is old enough to delete. - await azureProxy.DeleteBatchTaskAsync(task.Id, info.Pool, token); - task.IsTaskDeletionRequired = false; + // https://learn.microsoft.com/azure/batch/best-practices#manage-task-lifetime + var mins10 = TimeSpan.FromMinutes(10); + var now = DateTimeOffset.UtcNow; + + if (!tesTask.Logs.Any(l => now - l.StartTime > mins10)) + { + return false; + } + + await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); + tesTask.IsTaskDeletionRequired = false; + return true; } - private async Task TerminateBatchTaskAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) + private async Task TerminateBatchTaskAsync(TesTask tesTask, CancellationToken cancellationToken) { try { - await azureProxy.TerminateBatchTaskAsync(tesTask.Id, batchInfo.Pool, cancellationToken); + await azureProxy.TerminateBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); tesTask.IsTaskDeletionRequired = true; tesTask.State = TesState.CANCELEDEnum; + return true; } + //TODO: catch exception returned if the task as already completed. 
catch (Exception exc) { logger.LogError(exc, "Exception terminating batch task with tesTask.Id: {TesTaskId}", tesTask?.Id); @@ -385,21 +390,49 @@ public async Task UploadTaskRunnerIfNeeded(CancellationToken cancellationToken) //} /// - public async IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessCompletedTesTasksAsync(IEnumerable tesTask, CloudTask[] cloudTask, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) + public async IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessCompletedTesTasksAsync(IEnumerable tesTasks, CloudTask[] cloudTask, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) { yield break; } /// - public async IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessCancelledTesTasksAsync(IEnumerable tesTask, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) + public async IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessCancelledTesTasksAsync(IEnumerable tesTasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) { - yield break; + foreach (var task in tesTasks) + { + Task result; + + try + { + result = Task.FromResult(await TerminateBatchTaskAsync(task, cancellationToken)); + } + catch (Exception ex) + { + result = Task.FromException(ex); + } + + yield return (task, result); + } } /// - public async IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessTerminatedTesTasksAsync(IEnumerable tesTask, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) + public async IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessTerminatedTesTasksAsync(IEnumerable tesTasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) { - yield break; + foreach (var task in tesTasks) + { + Task result; + + try + { + result = Task.FromResult(await 
DeleteCancelledTaskAsync(task, cancellationToken)); + } + catch (Exception ex) + { + result = Task.FromException(ex); + } + + yield return (task, result); + } } @@ -529,13 +562,11 @@ private bool TryGetCromwellTmpFilePath(string fileUri, out string localPath) } var neededPoolCounts = taskMetadataByPools.ToDictionary(t => t.Key, t => t.Value.Count); - tesTasks = tasks.ToArray(); - foreach (var tesTask in tesTasks) + foreach (var (tesTask, virtualMachineInfo, containerMetadata, displayName) in taskMetadataByPools.Values.SelectMany(e => e).Where(m => tasks.Contains(m.TesTask))) { Task quickResult = default; var poolKey = poolKeyByTasks[tesTask.Id]; - var (_, virtualMachineInfo, containerMetadata, displayName) = taskMetadataByPools[poolKey].Single(t => t.TesTask.Id == tesTask.Id); try { @@ -1171,6 +1202,7 @@ private async Task ConvertTesTaskToBatchTaskAsync(string taskId, TesT var cloudTask = new CloudTask(taskId, batchRunCommand) { + Constraints = new(maxWallClockTime: poolLifetime, retentionTime: TimeSpan.Zero, maxTaskRetryCount: 0), UserIdentity = new UserIdentity(new AutoUserSpecification(elevationLevel: ElevationLevel.Admin, scope: AutoUserScope.Pool)), OutputFiles = new List { diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index 67babb914..3f804ad4a 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -94,10 +94,10 @@ public async Task AddBatchTaskAsync(string tesTaskId, CloudTask cloudTask, PoolI public Task DeleteBatchJobAsync(string taskId, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchJobAsync(taskId, ct), cancellationToken); /// - public Task TerminateBatchTaskAsync(string taskId, PoolInformation poolInformation, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.TerminateBatchTaskAsync(taskId, poolInformation, ct), 
cancellationToken); + public Task TerminateBatchTaskAsync(string taskId, string jobId, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.TerminateBatchTaskAsync(taskId, jobId, ct), cancellationToken); /// - public Task DeleteBatchTaskAsync(string taskId, PoolInformation poolInformation, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchTaskAsync(taskId, poolInformation, ct), cancellationToken); + public Task DeleteBatchTaskAsync(string taskId, string jobId, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchTaskAsync(taskId, jobId, ct), cancellationToken); /// public Task DeleteBatchPoolAsync(string poolId, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchPoolAsync(poolId, ct), cancellationToken); diff --git a/src/TesApi.Web/IAzureProxy.cs b/src/TesApi.Web/IAzureProxy.cs index 18bb587a1..f30e30a3c 100644 --- a/src/TesApi.Web/IAzureProxy.cs +++ b/src/TesApi.Web/IAzureProxy.cs @@ -76,17 +76,17 @@ public interface IAzureProxy /// Terminates an Azure Batch task /// /// The unique TES task ID - /// + /// The batch job that contains the task /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. - Task TerminateBatchTaskAsync(string taskId, PoolInformation poolInformation, CancellationToken cancellationToken); + Task TerminateBatchTaskAsync(string taskId, string jobId, CancellationToken cancellationToken); /// /// Deletes an Azure Batch task /// /// The unique TES task ID - /// + /// The batch job that contains the task /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. 
- Task DeleteBatchTaskAsync(string taskId, PoolInformation poolInformation, CancellationToken cancellationToken); + Task DeleteBatchTaskAsync(string taskId, string jobId, CancellationToken cancellationToken); /// /// Gets the counts of active batch nodes, grouped by VmSize diff --git a/src/TesApi.Web/IBatchScheduler.cs b/src/TesApi.Web/IBatchScheduler.cs index 9f44572b2..7c47f116b 100644 --- a/src/TesApi.Web/IBatchScheduler.cs +++ b/src/TesApi.Web/IBatchScheduler.cs @@ -38,35 +38,35 @@ public interface IBatchScheduler /// /// Schedule queued s on a batch system /// - /// s to schedule on the batch system. + /// s to schedule on the batch system. /// A for controlling the lifetime of the asynchronous operation. /// True for each corresponding that needs to be persisted. - IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessQueuedTesTasksAsync(TesTask[] tesTask, CancellationToken cancellationToken); + IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessQueuedTesTasksAsync(TesTask[] tesTasks, CancellationToken cancellationToken); /// /// Finalize completed s on a batch system /// - /// s to schedule on the batch system. + /// s to schedule on the batch system. /// s corresponding to each . /// A for controlling the lifetime of the asynchronous operation. /// True for each corresponding that needs to be persisted. - IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessCompletedTesTasksAsync(IEnumerable tesTask, CloudTask[] cloudTask, CancellationToken cancellationToken); + IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessCompletedTesTasksAsync(IEnumerable tesTasks, CloudTask[] cloudTask, CancellationToken cancellationToken); /// /// Terminate cancelled s on a batch system /// - /// s to schedule on the batch system. + /// s to schedule on the batch system. /// A for controlling the lifetime of the asynchronous operation. /// True for each corresponding that needs to be persisted. 
- IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessCancelledTesTasksAsync(IEnumerable tesTask, CancellationToken cancellationToken); + IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessCancelledTesTasksAsync(IEnumerable tesTasks, CancellationToken cancellationToken); /// /// Delete terminated s on a batch system /// - /// s to schedule on the batch system. + /// s to schedule on the batch system. /// A for controlling the lifetime of the asynchronous operation. /// True for each corresponding that needs to be persisted. - IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessTerminatedTesTasksAsync(IEnumerable tesTask, CancellationToken cancellationToken); + IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessTerminatedTesTasksAsync(IEnumerable tesTasks, CancellationToken cancellationToken); //TODO: Add other task update methods here. From 49da0377a533155454a4a73f82bb2abf9f5087ca Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 27 Sep 2023 18:55:36 -0700 Subject: [PATCH 006/202] cleanup --- src/TesApi.Web/Scheduler.cs | 2 +- src/TesApi.Web/Startup.cs | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/TesApi.Web/Scheduler.cs b/src/TesApi.Web/Scheduler.cs index db6531e73..679908670 100644 --- a/src/TesApi.Web/Scheduler.cs +++ b/src/TesApi.Web/Scheduler.cs @@ -296,7 +296,7 @@ private async ValueTask OrchestrateTesTasksOnBatch(Func() .AddSingleton(CreateStorageAccessProviderFromConfiguration) .AddSingleton(sp => ActivatorUtilities.CreateInstance(sp, (IAzureProxy)sp.GetRequiredService(typeof(AzureProxy)))) .AddSingleton>(sp => ActivatorUtilities.CreateInstance>(sp, (IRepository)sp.GetRequiredService(typeof(TesTaskPostgreSqlRepository)))) From bf2edadda8977cee74e12b451c2115413d7d9cfa Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 28 Sep 2023 18:31:43 -0700 Subject: [PATCH 007/202] Implement finding completed tasks via batch --- src/TesApi.Tests/BatchPoolTests.cs | 4 +- 
src/TesApi.Tests/BatchSchedulerTests.cs | 720 +++++++++--------- src/TesApi.Web/AzureBatchJobAndTaskState.cs | 117 --- src/TesApi.Web/AzureProxy.cs | 272 +++---- src/TesApi.Web/BatchPool.cs | 53 +- src/TesApi.Web/BatchPoolService.cs | 126 ++- src/TesApi.Web/BatchScheduler.cs | 672 ++++++++-------- .../CachingWithRetriesAzureProxy.cs | 3 - src/TesApi.Web/IAzureProxy.cs | 8 +- src/TesApi.Web/IBatchPool.cs | 8 + src/TesApi.Web/IBatchScheduler.cs | 4 +- .../OrchestrateOnBatchSchedulerService.cs | 272 +++++++ src/TesApi.Web/Scheduler.cs | 260 +------ 13 files changed, 1297 insertions(+), 1222 deletions(-) delete mode 100644 src/TesApi.Web/AzureBatchJobAndTaskState.cs create mode 100644 src/TesApi.Web/OrchestrateOnBatchSchedulerService.cs diff --git a/src/TesApi.Tests/BatchPoolTests.cs b/src/TesApi.Tests/BatchPoolTests.cs index d88df1dea..bd455df3f 100644 --- a/src/TesApi.Tests/BatchPoolTests.cs +++ b/src/TesApi.Tests/BatchPoolTests.cs @@ -380,7 +380,7 @@ static Microsoft.Azure.Batch.Protocol.Models.MetadataItem ConvertMetadata(Micros => item is null ? 
default : new(item.Name, item.Value); } - internal static CloudTask GenerateTask(string jobId, string id, DateTime stateTransitionTime = default) + internal static CloudTask GenerateTask(string jobId, string id, DateTime stateTransitionTime = default, Microsoft.Azure.Batch.Protocol.Models.TaskExecutionInformation executionInfo = default) { if (default == stateTransitionTime) { @@ -393,7 +393,7 @@ internal static CloudTask GenerateTask(string jobId, string id, DateTime stateTr .Invoke(new object[] { batchServiceClient }); var parentClient = (BatchClient)typeof(BatchClient).GetConstructor(System.Reflection.BindingFlags.Instance | System.Reflection.BindingFlags.NonPublic, null, new Type[] { typeof(Microsoft.Azure.Batch.Protocol.BatchServiceClient).Assembly.GetType("Microsoft.Azure.Batch.IProtocolLayer") }, null) .Invoke(new object[] { protocolLayer }); - var modelTask = new Microsoft.Azure.Batch.Protocol.Models.CloudTask(id: id, stateTransitionTime: stateTransitionTime, state: Microsoft.Azure.Batch.Protocol.Models.TaskState.Active); + var modelTask = new Microsoft.Azure.Batch.Protocol.Models.CloudTask(id: id, stateTransitionTime: stateTransitionTime, executionInfo: executionInfo, state: Microsoft.Azure.Batch.Protocol.Models.TaskState.Active); var task = (CloudTask)typeof(CloudTask).GetConstructor(System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance, default, new Type[] { typeof(BatchClient), typeof(string), typeof(Microsoft.Azure.Batch.Protocol.Models.CloudTask), typeof(IEnumerable) }, default) .Invoke(new object[] { parentClient, jobId, modelTask, Enumerable.Empty() }); return task; diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index ab1844b6a..154b3cc14 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -267,21 +267,21 @@ public async Task TesTaskFailsWithSystemErrorWhenTotalBatchQuotaIsSetTooLow() 
Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(new TesResources { CpuCores = 2, RamGb = 1, Preemptible = false }, azureProxyReturnValues)); } - [TestMethod] - public async Task TesTaskFailsWhenBatchNodeDiskIsFull() - { - var tesTask = GetTesTask(); - - (var failureReason, var systemLog) = await ProcessTesTaskAndGetFailureReasonAndSystemLogAsync(tesTask, BatchJobAndTaskStates.NodeDiskFull); - - GuardAssertsWithTesTask(tesTask, () => - { - Assert.AreEqual(TesState.EXECUTORERROREnum, tesTask.State); - Assert.AreEqual("DiskFull", failureReason); - Assert.AreEqual("DiskFull", systemLog[0]); - Assert.AreEqual("DiskFull", tesTask.FailureReason); - }); - } + //[TestMethod] + //public async Task TesTaskFailsWhenBatchNodeDiskIsFull() + //{ + // var tesTask = GetTesTask(); + + // (var failureReason, var systemLog) = await ProcessTesTaskAndGetFailureReasonAndSystemLogAsync(tesTask, BatchJobAndTaskStates.NodeDiskFull); + + // GuardAssertsWithTesTask(tesTask, () => + // { + // Assert.AreEqual(TesState.EXECUTORERROREnum, tesTask.State); + // Assert.AreEqual("DiskFull", failureReason); + // Assert.AreEqual("DiskFull", systemLog[0]); + // Assert.AreEqual("DiskFull", tesTask.FailureReason); + // }); + //} //TODO: This test (and potentially others) must be reviewed and see if they are necessary considering that the quota verification logic is its own class. // There are a couple of issues: a similar validation already exists in the quota verifier class, and in order to run this test a complex set up is required, which is hard to maintain. 
@@ -768,312 +768,312 @@ static async Task RunTest(string allowedVmSizes, TesState expectedTaskState, str await RunTest("VmFamily2", TesState.INITIALIZINGEnum, "VmSizeLowPri2"); } - [TestMethod] - public async Task TaskStateTransitionsFromRunningState() - { - Assert.AreEqual(TesState.RUNNINGEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.TaskActive)); - Assert.AreEqual(TesState.RUNNINGEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.TaskPreparing)); - Assert.AreEqual(TesState.RUNNINGEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.TaskRunning)); - Assert.AreEqual(TesState.COMPLETEEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.TaskCompletedSuccessfully)); - Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.TaskFailed)); - Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.JobNotFound)); - Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.TaskNotFound)); - Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.MoreThanOneJobFound)); - Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.NodeDiskFull)); - Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.ActiveJobWithMissingAutoPool)); - Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.NodePreempted)); - } - - [TestMethod] - public async Task TaskStateTransitionsFromInitializingState() - { - Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.TaskActive)); - 
Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.TaskPreparing)); - Assert.AreEqual(TesState.RUNNINGEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.TaskRunning)); - Assert.AreEqual(TesState.COMPLETEEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.TaskCompletedSuccessfully)); - Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.TaskFailed)); - Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.JobNotFound)); - Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.TaskNotFound)); - Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.MoreThanOneJobFound)); - Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.NodeDiskFull)); - Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.NodeAllocationFailed)); - Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.ImageDownloadFailed)); - Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.ActiveJobWithMissingAutoPool)); - Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.NodePreempted)); - } - - [TestMethod] - public async Task TaskStateTransitionsFromQueuedState() - { - Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchJobAndTaskStates.TaskActive)); - Assert.AreEqual(TesState.INITIALIZINGEnum, await 
GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchJobAndTaskStates.TaskPreparing)); - Assert.AreEqual(TesState.RUNNINGEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchJobAndTaskStates.TaskRunning)); - Assert.AreEqual(TesState.COMPLETEEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchJobAndTaskStates.TaskCompletedSuccessfully)); - Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchJobAndTaskStates.TaskFailed)); - Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchJobAndTaskStates.MoreThanOneJobFound)); - Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchJobAndTaskStates.NodeDiskFull)); - Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchJobAndTaskStates.TaskNotFound)); - } - - [TestMethod] - public async Task TaskIsRequeuedUpToThreeTimesForTransientErrors() - { - var tesTask = GetTesTask(); - - var azureProxyReturnValues = AzureProxyReturnValues.Defaults; - - azureProxyReturnValues.VmSizesAndPrices = new() { - new() { VmSize = "VmSize1", LowPriority = false, VCpusAvailable = 2, MemoryInGiB = 4, ResourceDiskSizeInGiB = 20, PricePerHour = 1 }, - new() { VmSize = "VmSize2", LowPriority = false, VCpusAvailable = 2, MemoryInGiB = 4, ResourceDiskSizeInGiB = 20, PricePerHour = 2 }, - new() { VmSize = "VmSize3", LowPriority = false, VCpusAvailable = 2, MemoryInGiB = 4, ResourceDiskSizeInGiB = 20, PricePerHour = 3 }, - new() { VmSize = "VmSize4", LowPriority = false, VCpusAvailable = 2, MemoryInGiB = 4, ResourceDiskSizeInGiB = 20, PricePerHour = 4 }, - new() { VmSize = "VmSize5", LowPriority = false, VCpusAvailable = 2, MemoryInGiB = 4, ResourceDiskSizeInGiB = 20, PricePerHour = 5 } - }; - - await GetNewTesTaskStateAsync(tesTask, azureProxyReturnValues); - await GuardAssertsWithTesTask(tesTask, async () => Assert.AreEqual(TesState.QUEUEDEnum, await 
GetNewTesTaskStateAsync(tesTask, BatchJobAndTaskStates.NodeAllocationFailed))); - await GetNewTesTaskStateAsync(tesTask, azureProxyReturnValues); - await GuardAssertsWithTesTask(tesTask, async () => Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(tesTask, BatchJobAndTaskStates.NodeAllocationFailed))); - await GetNewTesTaskStateAsync(tesTask, azureProxyReturnValues); - await GuardAssertsWithTesTask(tesTask, async () => Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(tesTask, BatchJobAndTaskStates.NodeAllocationFailed))); - await GetNewTesTaskStateAsync(tesTask, azureProxyReturnValues); - await GuardAssertsWithTesTask(tesTask, async () => Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(tesTask, BatchJobAndTaskStates.NodeAllocationFailed))); - } - - [TestMethod] - public async Task TaskThatFailsWithNodeAllocationErrorIsRequeuedOnDifferentVmSize() - { - var tesTask = GetTesTask(); - - await GetNewTesTaskStateAsync(tesTask); - await GetNewTesTaskStateAsync(tesTask, BatchJobAndTaskStates.NodeAllocationFailed); - var firstAttemptVmSize = tesTask.Logs[0].VirtualMachineInfo.VmSize; - - await GetNewTesTaskStateAsync(tesTask); - await GetNewTesTaskStateAsync(tesTask, BatchJobAndTaskStates.NodeAllocationFailed); - var secondAttemptVmSize = tesTask.Logs[1].VirtualMachineInfo.VmSize; - - GuardAssertsWithTesTask(tesTask, () => Assert.AreNotEqual(firstAttemptVmSize, secondAttemptVmSize)); - - // There are only two suitable VMs, and both have been excluded because of the NodeAllocationFailed error on the two earlier attempts - _ = await GetNewTesTaskStateAsync(tesTask); - - GuardAssertsWithTesTask(tesTask, () => - { - Assert.AreEqual(TesState.SYSTEMERROREnum, tesTask.State); - Assert.AreEqual("NoVmSizeAvailable", tesTask.FailureReason); - }); - } - - [TestMethod] - public async Task TaskGetsCancelled() - { - var tesTask = new TesTask { Id = "test", PoolId = "pool1", State = TesState.CANCELINGEnum }; - - var 
azureProxyReturnValues = AzureProxyReturnValues.Defaults; - azureProxyReturnValues.BatchJobAndTaskState = BatchJobAndTaskStates.TaskActive; - Mock azureProxy = default; - var azureProxySetter = new Action>(mock => - { - GetMockAzureProxy(azureProxyReturnValues)(mock); - azureProxy = mock; - }); - - _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxySetter, azureProxyReturnValues); - - GuardAssertsWithTesTask(tesTask, () => - { - Assert.AreEqual(TesState.CANCELEDEnum, tesTask.State); - Assert.IsTrue(tesTask.IsTaskDeletionRequired); - azureProxy.Verify(i => i.TerminateBatchTaskAsync(tesTask.Id, It.IsAny(), It.IsAny())); - }); - } - - [TestMethod] - public async Task CancelledTaskGetsDeleted() - { - var tesTask = new TesTask { Id = "test", PoolId = "pool1", State = TesState.CANCELEDEnum, IsTaskDeletionRequired = true, Logs = new() { new() { StartTime = DateTimeOffset.UtcNow - TimeSpan.FromMinutes(11) } } }; - - var azureProxyReturnValues = AzureProxyReturnValues.Defaults; - azureProxyReturnValues.BatchJobAndTaskState = BatchJobAndTaskStates.TaskActive; - Mock azureProxy = default; - var azureProxySetter = new Action>(mock => - { - GetMockAzureProxy(azureProxyReturnValues)(mock); - azureProxy = mock; - }); - - _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxySetter, azureProxyReturnValues); - - GuardAssertsWithTesTask(tesTask, () => - { - Assert.IsFalse(tesTask.IsTaskDeletionRequired); - azureProxy.Verify(i => i.DeleteBatchTaskAsync(tesTask.Id, It.IsAny(), It.IsAny())); - }); - } - - [TestMethod] - public async Task SuccessfullyCompletedTaskContainsBatchNodeMetrics() - { - var tesTask = GetTesTask(); - - var metricsFileContent = @" - BlobXferPullStart=2020-10-08T02:30:39+00:00 - BlobXferPullEnd=2020-10-08T02:31:39+00:00 - ExecutorPullStart=2020-10-08T02:32:39+00:00 - ExecutorImageSizeInBytes=3000000000 - ExecutorPullEnd=2020-10-08T02:34:39+00:00 - DownloadStart=2020-10-08T02:35:39+00:00 - 
FileDownloadSizeInBytes=2000000000 - DownloadEnd=2020-10-08T02:38:39+00:00 - ExecutorStart=2020-10-08T02:39:39+00:00 - ExecutorEnd=2020-10-08T02:43:39+00:00 - UploadStart=2020-10-08T02:44:39+00:00 - FileUploadSizeInBytes=4000000000 - UploadEnd=2020-10-08T02:49:39+00:00 - DiskSizeInKiB=8000000 - DiskUsedInKiB=1000000".Replace(" ", string.Empty); - - var azureProxyReturnValues = AzureProxyReturnValues.Defaults; - azureProxyReturnValues.BatchJobAndTaskState = BatchJobAndTaskStates.TaskCompletedSuccessfully; - azureProxyReturnValues.DownloadedBlobContent = metricsFileContent; - - _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(azureProxyReturnValues), azureProxyReturnValues); - - GuardAssertsWithTesTask(tesTask, () => - { - Assert.AreEqual(TesState.COMPLETEEnum, tesTask.State); - - var batchNodeMetrics = tesTask.GetOrAddTesTaskLog().BatchNodeMetrics; - Assert.IsNotNull(batchNodeMetrics); - Assert.AreEqual(60, batchNodeMetrics.BlobXferImagePullDurationInSeconds); - Assert.AreEqual(120, batchNodeMetrics.ExecutorImagePullDurationInSeconds); - Assert.AreEqual(3, batchNodeMetrics.ExecutorImageSizeInGB); - Assert.AreEqual(180, batchNodeMetrics.FileDownloadDurationInSeconds); - Assert.AreEqual(240, batchNodeMetrics.ExecutorDurationInSeconds); - Assert.AreEqual(300, batchNodeMetrics.FileUploadDurationInSeconds); - Assert.AreEqual(1.024, batchNodeMetrics.DiskUsedInGB); - Assert.AreEqual(12.5f, batchNodeMetrics.DiskUsedPercent); - Assert.AreEqual(2, batchNodeMetrics.FileDownloadSizeInGB); - Assert.AreEqual(4, batchNodeMetrics.FileUploadSizeInGB); - - var executorLog = tesTask.GetOrAddTesTaskLog().GetOrAddExecutorLog(); - Assert.IsNotNull(executorLog); - Assert.AreEqual(0, executorLog.ExitCode); - Assert.AreEqual(DateTimeOffset.Parse("2020-10-08T02:30:39+00:00"), executorLog.StartTime); - Assert.AreEqual(DateTimeOffset.Parse("2020-10-08T02:49:39+00:00"), executorLog.EndTime); - }); - } - - [TestMethod] - public async Task 
SuccessfullyCompletedTaskContainsCromwellResultCode() - { - var tesTask = GetTesTask(); - - var azureProxyReturnValues = AzureProxyReturnValues.Defaults; - azureProxyReturnValues.BatchJobAndTaskState = BatchJobAndTaskStates.TaskCompletedSuccessfully; - azureProxyReturnValues.DownloadedBlobContent = "2"; - var azureProxy = GetMockAzureProxy(azureProxyReturnValues); - - _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxy, azureProxyReturnValues); - - GuardAssertsWithTesTask(tesTask, () => - { - Assert.AreEqual(TesState.COMPLETEEnum, tesTask.State); - Assert.AreEqual(2, tesTask.GetOrAddTesTaskLog().CromwellResultCode); - Assert.AreEqual(2, tesTask.CromwellResultCode); - }); - } - - [TestMethod] - public async Task TesInputFilePathMustStartWithCromwellExecutions() - { - var tesTask = GetTesTask(); - - tesTask.Inputs.Add(new() - { - Path = "xyz/path" - }); - - (var failureReason, var systemLog) = await ProcessTesTaskAndGetFailureReasonAndSystemLogAsync(tesTask); - - GuardAssertsWithTesTask(tesTask, () => - { - Assert.AreEqual(TesState.SYSTEMERROREnum, tesTask.State); - Assert.AreEqual($"InvalidInputFilePath", failureReason); - Assert.AreEqual($"InvalidInputFilePath", systemLog[0]); - Assert.AreEqual($"Unsupported input path 'xyz/path' for task Id {tesTask.Id}. 
Must start with '/'.", systemLog[1]); - }); - } - - [TestMethod] - public async Task TesInputFileMustHaveEitherUrlOrContent() - { - var tesTask = GetTesTask(); - - tesTask.Inputs.Add(new() - { - Url = null, - Content = null, - Path = "/file1.txt" - }); - - (var failureReason, var systemLog) = await ProcessTesTaskAndGetFailureReasonAndSystemLogAsync(tesTask); - - GuardAssertsWithTesTask(tesTask, () => - { - Assert.AreEqual(TesState.SYSTEMERROREnum, tesTask.State); - Assert.AreEqual($"InvalidInputFilePath", failureReason); - Assert.AreEqual($"InvalidInputFilePath", systemLog[0]); - Assert.AreEqual($"One of Input Url or Content must be set", systemLog[1]); - }); - } - - [TestMethod] - public async Task TesInputFileMustNotHaveBothUrlAndContent() - { - var tesTask = GetTesTask(); - - tesTask.Inputs.Add(new() - { - Url = "/storageaccount1/container1/file1.txt", - Content = "test content", - Path = "/file1.txt" - }); - - (var failureReason, var systemLog) = await ProcessTesTaskAndGetFailureReasonAndSystemLogAsync(tesTask); - - GuardAssertsWithTesTask(tesTask, () => - { - Assert.AreEqual(TesState.SYSTEMERROREnum, tesTask.State); - Assert.AreEqual($"InvalidInputFilePath", failureReason); - Assert.AreEqual($"InvalidInputFilePath", systemLog[0]); - Assert.AreEqual($"Input Url and Content cannot be both set", systemLog[1]); - }); - } - - [TestMethod] - public async Task TesInputFileTypeMustNotBeDirectory() - { - var tesTask = GetTesTask(); - - tesTask.Inputs.Add(new() - { - Url = "/storageaccount1/container1/directory", - Type = TesFileType.DIRECTORYEnum, - Path = "/directory" - }); - - (var failureReason, var systemLog) = await ProcessTesTaskAndGetFailureReasonAndSystemLogAsync(tesTask); - - GuardAssertsWithTesTask(tesTask, () => - { - Assert.AreEqual(TesState.SYSTEMERROREnum, tesTask.State); - Assert.AreEqual($"InvalidInputFilePath", failureReason); - Assert.AreEqual($"InvalidInputFilePath", systemLog[0]); - Assert.AreEqual($"Directory input is not supported.", 
systemLog[1]); - }); - } + //[TestMethod] + //public async Task TaskStateTransitionsFromRunningState() + //{ + // Assert.AreEqual(TesState.RUNNINGEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.TaskActive)); + // Assert.AreEqual(TesState.RUNNINGEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.TaskPreparing)); + // Assert.AreEqual(TesState.RUNNINGEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.TaskRunning)); + // Assert.AreEqual(TesState.COMPLETEEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.TaskCompletedSuccessfully)); + // Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.TaskFailed)); + // Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.JobNotFound)); + // Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.TaskNotFound)); + // Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.MoreThanOneJobFound)); + // Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.NodeDiskFull)); + // Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.ActiveJobWithMissingAutoPool)); + // Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.NodePreempted)); + //} + + //[TestMethod] + //public async Task TaskStateTransitionsFromInitializingState() + //{ + // Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.TaskActive)); + // Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, 
BatchJobAndTaskStates.TaskPreparing)); + // Assert.AreEqual(TesState.RUNNINGEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.TaskRunning)); + // Assert.AreEqual(TesState.COMPLETEEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.TaskCompletedSuccessfully)); + // Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.TaskFailed)); + // Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.JobNotFound)); + // Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.TaskNotFound)); + // Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.MoreThanOneJobFound)); + // Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.NodeDiskFull)); + // Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.NodeAllocationFailed)); + // Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.ImageDownloadFailed)); + // Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.ActiveJobWithMissingAutoPool)); + // Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.NodePreempted)); + //} + + //[TestMethod] + //public async Task TaskStateTransitionsFromQueuedState() + //{ + // Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchJobAndTaskStates.TaskActive)); + // Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, 
BatchJobAndTaskStates.TaskPreparing)); + // Assert.AreEqual(TesState.RUNNINGEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchJobAndTaskStates.TaskRunning)); + // Assert.AreEqual(TesState.COMPLETEEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchJobAndTaskStates.TaskCompletedSuccessfully)); + // Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchJobAndTaskStates.TaskFailed)); + // Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchJobAndTaskStates.MoreThanOneJobFound)); + // Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchJobAndTaskStates.NodeDiskFull)); + // Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchJobAndTaskStates.TaskNotFound)); + //} + + //[TestMethod] + //public async Task TaskIsRequeuedUpToThreeTimesForTransientErrors() + //{ + // var tesTask = GetTesTask(); + + // var azureProxyReturnValues = AzureProxyReturnValues.Defaults; + + // azureProxyReturnValues.VmSizesAndPrices = new() { + // new() { VmSize = "VmSize1", LowPriority = false, VCpusAvailable = 2, MemoryInGiB = 4, ResourceDiskSizeInGiB = 20, PricePerHour = 1 }, + // new() { VmSize = "VmSize2", LowPriority = false, VCpusAvailable = 2, MemoryInGiB = 4, ResourceDiskSizeInGiB = 20, PricePerHour = 2 }, + // new() { VmSize = "VmSize3", LowPriority = false, VCpusAvailable = 2, MemoryInGiB = 4, ResourceDiskSizeInGiB = 20, PricePerHour = 3 }, + // new() { VmSize = "VmSize4", LowPriority = false, VCpusAvailable = 2, MemoryInGiB = 4, ResourceDiskSizeInGiB = 20, PricePerHour = 4 }, + // new() { VmSize = "VmSize5", LowPriority = false, VCpusAvailable = 2, MemoryInGiB = 4, ResourceDiskSizeInGiB = 20, PricePerHour = 5 } + // }; + + // await GetNewTesTaskStateAsync(tesTask, azureProxyReturnValues); + // await GuardAssertsWithTesTask(tesTask, async () => Assert.AreEqual(TesState.QUEUEDEnum, 
await GetNewTesTaskStateAsync(tesTask, BatchJobAndTaskStates.NodeAllocationFailed))); + // await GetNewTesTaskStateAsync(tesTask, azureProxyReturnValues); + // await GuardAssertsWithTesTask(tesTask, async () => Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(tesTask, BatchJobAndTaskStates.NodeAllocationFailed))); + // await GetNewTesTaskStateAsync(tesTask, azureProxyReturnValues); + // await GuardAssertsWithTesTask(tesTask, async () => Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(tesTask, BatchJobAndTaskStates.NodeAllocationFailed))); + // await GetNewTesTaskStateAsync(tesTask, azureProxyReturnValues); + // await GuardAssertsWithTesTask(tesTask, async () => Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(tesTask, BatchJobAndTaskStates.NodeAllocationFailed))); + //} + + //[TestMethod] + //public async Task TaskThatFailsWithNodeAllocationErrorIsRequeuedOnDifferentVmSize() + //{ + // var tesTask = GetTesTask(); + + // await GetNewTesTaskStateAsync(tesTask); + // await GetNewTesTaskStateAsync(tesTask, BatchJobAndTaskStates.NodeAllocationFailed); + // var firstAttemptVmSize = tesTask.Logs[0].VirtualMachineInfo.VmSize; + + // await GetNewTesTaskStateAsync(tesTask); + // await GetNewTesTaskStateAsync(tesTask, BatchJobAndTaskStates.NodeAllocationFailed); + // var secondAttemptVmSize = tesTask.Logs[1].VirtualMachineInfo.VmSize; + + // GuardAssertsWithTesTask(tesTask, () => Assert.AreNotEqual(firstAttemptVmSize, secondAttemptVmSize)); + + // // There are only two suitable VMs, and both have been excluded because of the NodeAllocationFailed error on the two earlier attempts + // _ = await GetNewTesTaskStateAsync(tesTask); + + // GuardAssertsWithTesTask(tesTask, () => + // { + // Assert.AreEqual(TesState.SYSTEMERROREnum, tesTask.State); + // Assert.AreEqual("NoVmSizeAvailable", tesTask.FailureReason); + // }); + //} + + //[TestMethod] + //public async Task TaskGetsCancelled() + //{ + // var tesTask = new 
TesTask { Id = "test", PoolId = "pool1", State = TesState.CANCELINGEnum }; + + // var azureProxyReturnValues = AzureProxyReturnValues.Defaults; + // azureProxyReturnValues.BatchJobAndTaskState = BatchJobAndTaskStates.TaskActive; + // Mock azureProxy = default; + // var azureProxySetter = new Action>(mock => + // { + // GetMockAzureProxy(azureProxyReturnValues)(mock); + // azureProxy = mock; + // }); + + // _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxySetter, azureProxyReturnValues); + + // GuardAssertsWithTesTask(tesTask, () => + // { + // Assert.AreEqual(TesState.CANCELEDEnum, tesTask.State); + // Assert.IsTrue(tesTask.IsTaskDeletionRequired); + // azureProxy.Verify(i => i.TerminateBatchTaskAsync(tesTask.Id, It.IsAny(), It.IsAny())); + // }); + //} + + //[TestMethod] + //public async Task CancelledTaskGetsDeleted() + //{ + // var tesTask = new TesTask { Id = "test", PoolId = "pool1", State = TesState.CANCELEDEnum, IsTaskDeletionRequired = true, Logs = new() { new() { StartTime = DateTimeOffset.UtcNow - TimeSpan.FromMinutes(11) } } }; + + // var azureProxyReturnValues = AzureProxyReturnValues.Defaults; + // azureProxyReturnValues.BatchJobAndTaskState = BatchJobAndTaskStates.TaskActive; + // Mock azureProxy = default; + // var azureProxySetter = new Action>(mock => + // { + // GetMockAzureProxy(azureProxyReturnValues)(mock); + // azureProxy = mock; + // }); + + // _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxySetter, azureProxyReturnValues); + + // GuardAssertsWithTesTask(tesTask, () => + // { + // Assert.IsFalse(tesTask.IsTaskDeletionRequired); + // azureProxy.Verify(i => i.DeleteBatchTaskAsync(tesTask.Id, It.IsAny(), It.IsAny())); + // }); + //} + + //[TestMethod] + //public async Task SuccessfullyCompletedTaskContainsBatchNodeMetrics() + //{ + // var tesTask = GetTesTask(); + + // var metricsFileContent = @" + // BlobXferPullStart=2020-10-08T02:30:39+00:00 + // 
BlobXferPullEnd=2020-10-08T02:31:39+00:00 + // ExecutorPullStart=2020-10-08T02:32:39+00:00 + // ExecutorImageSizeInBytes=3000000000 + // ExecutorPullEnd=2020-10-08T02:34:39+00:00 + // DownloadStart=2020-10-08T02:35:39+00:00 + // FileDownloadSizeInBytes=2000000000 + // DownloadEnd=2020-10-08T02:38:39+00:00 + // ExecutorStart=2020-10-08T02:39:39+00:00 + // ExecutorEnd=2020-10-08T02:43:39+00:00 + // UploadStart=2020-10-08T02:44:39+00:00 + // FileUploadSizeInBytes=4000000000 + // UploadEnd=2020-10-08T02:49:39+00:00 + // DiskSizeInKiB=8000000 + // DiskUsedInKiB=1000000".Replace(" ", string.Empty); + + // var azureProxyReturnValues = AzureProxyReturnValues.Defaults; + // azureProxyReturnValues.BatchJobAndTaskState = BatchJobAndTaskStates.TaskCompletedSuccessfully; + // azureProxyReturnValues.DownloadedBlobContent = metricsFileContent; + + // _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(azureProxyReturnValues), azureProxyReturnValues); + + // GuardAssertsWithTesTask(tesTask, () => + // { + // Assert.AreEqual(TesState.COMPLETEEnum, tesTask.State); + + // var batchNodeMetrics = tesTask.GetOrAddTesTaskLog().BatchNodeMetrics; + // Assert.IsNotNull(batchNodeMetrics); + // Assert.AreEqual(60, batchNodeMetrics.BlobXferImagePullDurationInSeconds); + // Assert.AreEqual(120, batchNodeMetrics.ExecutorImagePullDurationInSeconds); + // Assert.AreEqual(3, batchNodeMetrics.ExecutorImageSizeInGB); + // Assert.AreEqual(180, batchNodeMetrics.FileDownloadDurationInSeconds); + // Assert.AreEqual(240, batchNodeMetrics.ExecutorDurationInSeconds); + // Assert.AreEqual(300, batchNodeMetrics.FileUploadDurationInSeconds); + // Assert.AreEqual(1.024, batchNodeMetrics.DiskUsedInGB); + // Assert.AreEqual(12.5f, batchNodeMetrics.DiskUsedPercent); + // Assert.AreEqual(2, batchNodeMetrics.FileDownloadSizeInGB); + // Assert.AreEqual(4, batchNodeMetrics.FileUploadSizeInGB); + + // var executorLog = tesTask.GetOrAddTesTaskLog().GetOrAddExecutorLog(); + 
// Assert.IsNotNull(executorLog); + // Assert.AreEqual(0, executorLog.ExitCode); + // Assert.AreEqual(DateTimeOffset.Parse("2020-10-08T02:30:39+00:00"), executorLog.StartTime); + // Assert.AreEqual(DateTimeOffset.Parse("2020-10-08T02:49:39+00:00"), executorLog.EndTime); + // }); + //} + + //[TestMethod] + //public async Task SuccessfullyCompletedTaskContainsCromwellResultCode() + //{ + // var tesTask = GetTesTask(); + + // var azureProxyReturnValues = AzureProxyReturnValues.Defaults; + // azureProxyReturnValues.BatchJobAndTaskState = BatchJobAndTaskStates.TaskCompletedSuccessfully; + // azureProxyReturnValues.DownloadedBlobContent = "2"; + // var azureProxy = GetMockAzureProxy(azureProxyReturnValues); + + // _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxy, azureProxyReturnValues); + + // GuardAssertsWithTesTask(tesTask, () => + // { + // Assert.AreEqual(TesState.COMPLETEEnum, tesTask.State); + // Assert.AreEqual(2, tesTask.GetOrAddTesTaskLog().CromwellResultCode); + // Assert.AreEqual(2, tesTask.CromwellResultCode); + // }); + //} + + //[TestMethod] + //public async Task TesInputFilePathMustStartWithCromwellExecutions() + //{ + // var tesTask = GetTesTask(); + + // tesTask.Inputs.Add(new() + // { + // Path = "xyz/path" + // }); + + // (var failureReason, var systemLog) = await ProcessTesTaskAndGetFailureReasonAndSystemLogAsync(tesTask); + + // GuardAssertsWithTesTask(tesTask, () => + // { + // Assert.AreEqual(TesState.SYSTEMERROREnum, tesTask.State); + // Assert.AreEqual($"InvalidInputFilePath", failureReason); + // Assert.AreEqual($"InvalidInputFilePath", systemLog[0]); + // Assert.AreEqual($"Unsupported input path 'xyz/path' for task Id {tesTask.Id}. 
Must start with '/'.", systemLog[1]); + // }); + //} + + //[TestMethod] + //public async Task TesInputFileMustHaveEitherUrlOrContent() + //{ + // var tesTask = GetTesTask(); + + // tesTask.Inputs.Add(new() + // { + // Url = null, + // Content = null, + // Path = "/file1.txt" + // }); + + // (var failureReason, var systemLog) = await ProcessTesTaskAndGetFailureReasonAndSystemLogAsync(tesTask); + + // GuardAssertsWithTesTask(tesTask, () => + // { + // Assert.AreEqual(TesState.SYSTEMERROREnum, tesTask.State); + // Assert.AreEqual($"InvalidInputFilePath", failureReason); + // Assert.AreEqual($"InvalidInputFilePath", systemLog[0]); + // Assert.AreEqual($"One of Input Url or Content must be set", systemLog[1]); + // }); + //} + + //[TestMethod] + //public async Task TesInputFileMustNotHaveBothUrlAndContent() + //{ + // var tesTask = GetTesTask(); + + // tesTask.Inputs.Add(new() + // { + // Url = "/storageaccount1/container1/file1.txt", + // Content = "test content", + // Path = "/file1.txt" + // }); + + // (var failureReason, var systemLog) = await ProcessTesTaskAndGetFailureReasonAndSystemLogAsync(tesTask); + + // GuardAssertsWithTesTask(tesTask, () => + // { + // Assert.AreEqual(TesState.SYSTEMERROREnum, tesTask.State); + // Assert.AreEqual($"InvalidInputFilePath", failureReason); + // Assert.AreEqual($"InvalidInputFilePath", systemLog[0]); + // Assert.AreEqual($"Input Url and Content cannot be both set", systemLog[1]); + // }); + //} + + //[TestMethod] + //public async Task TesInputFileTypeMustNotBeDirectory() + //{ + // var tesTask = GetTesTask(); + + // tesTask.Inputs.Add(new() + // { + // Url = "/storageaccount1/container1/directory", + // Type = TesFileType.DIRECTORYEnum, + // Path = "/directory" + // }); + + // (var failureReason, var systemLog) = await ProcessTesTaskAndGetFailureReasonAndSystemLogAsync(tesTask); + + // GuardAssertsWithTesTask(tesTask, () => + // { + // Assert.AreEqual(TesState.SYSTEMERROREnum, tesTask.State); + // 
Assert.AreEqual($"InvalidInputFilePath", failureReason); + // Assert.AreEqual($"InvalidInputFilePath", systemLog[0]); + // Assert.AreEqual($"Directory input is not supported.", systemLog[1]); + // }); + //} [TestMethod] public async Task QueryStringsAreRemovedFromLocalFilePathsWhenCommandScriptIsProvidedAsFile() @@ -1506,15 +1506,15 @@ public async Task Assert.AreEqual(expectedPath, destination.Container.Path); } - private static async Task<(string FailureReason, string[] SystemLog)> ProcessTesTaskAndGetFailureReasonAndSystemLogAsync(TesTask tesTask, AzureBatchJobAndTaskState? azureBatchJobAndTaskState = null) - { - var azureProxyReturnValues = AzureProxyReturnValues.Defaults; - azureProxyReturnValues.BatchJobAndTaskState = azureBatchJobAndTaskState ?? azureProxyReturnValues.BatchJobAndTaskState; + //private static async Task<(string FailureReason, string[] SystemLog)> ProcessTesTaskAndGetFailureReasonAndSystemLogAsync(TesTask tesTask, AzureBatchJobAndTaskState? azureBatchJobAndTaskState = null) + //{ + // var azureProxyReturnValues = AzureProxyReturnValues.Defaults; + // azureProxyReturnValues.BatchJobAndTaskState = azureBatchJobAndTaskState ?? 
azureProxyReturnValues.BatchJobAndTaskState; - _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(azureProxyReturnValues), azureProxyReturnValues); + // _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(azureProxyReturnValues), azureProxyReturnValues); - return (tesTask.Logs?.LastOrDefault()?.FailureReason, tesTask.Logs?.LastOrDefault()?.SystemLogs?.ToArray()); - } + // return (tesTask.Logs?.LastOrDefault()?.FailureReason, tesTask.Logs?.LastOrDefault()?.SystemLogs?.ToArray()); + //} private static Task<(string JobId, CloudTask CloudTask, PoolInformation PoolInformation, Pool batchModelsPool)> ProcessTesTaskAndGetBatchJobArgumentsAsync() => ProcessTesTaskAndGetBatchJobArgumentsAsync(GetTesTask(), GetMockConfig()(), GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); @@ -1532,11 +1532,25 @@ public async Task var batchScheduler = serviceProvider.GetT(); serviceProviderActions?.Invoke(serviceProvider); + //BatchJobAndTaskStates.TaskActive + //BatchJobAndTaskStates.TaskPreparing + //BatchJobAndTaskStates.TaskRunning + //BatchJobAndTaskStates.TaskCompletedSuccessfully + //BatchJobAndTaskStates.TaskFailed + //BatchJobAndTaskStates.JobNotFound + //BatchJobAndTaskStates.TaskNotFound + //BatchJobAndTaskStates.MoreThanOneJobFound + //BatchJobAndTaskStates.NodeAllocationFailed + //BatchJobAndTaskStates.NodePreempted + //BatchJobAndTaskStates.NodeDiskFull + //BatchJobAndTaskStates.ActiveJobWithMissingAutoPool + //BatchJobAndTaskStates.ImageDownloadFailed + await foreach (var _ in tesTask.State switch { TesState.CANCELINGEnum => batchScheduler.ProcessCancelledTesTasksAsync(new[] { tesTask }, System.Threading.CancellationToken.None), TesState.QUEUEDEnum => batchScheduler.ProcessQueuedTesTasksAsync(new[] { tesTask }, System.Threading.CancellationToken.None), - TesState.COMPLETEEnum => batchScheduler.ProcessCompletedTesTasksAsync(new[] { tesTask }, 
new[] { new CloudTask(tesTask.Id, "") }, System.Threading.CancellationToken.None), + TesState.COMPLETEEnum => batchScheduler.ProcessCompletedTesTasksAsync(new[] { tesTask }, new[] { BatchPoolTests.GenerateTask(tesTask.PoolId, tesTask.Id, executionInfo: new(0, 0, startTime: DateTime.UtcNow - TimeSpan.FromMinutes(15), endTime: DateTime.UtcNow - TimeSpan.FromMinutes(5), exitCode: 0)) }, System.Threading.CancellationToken.None), TesState.EXECUTORERROREnum => batchScheduler.ProcessCompletedTesTasksAsync(new[] { tesTask }, new[] { new CloudTask(tesTask.Id, "") }, System.Threading.CancellationToken.None), TesState.SYSTEMERROREnum => batchScheduler.ProcessCompletedTesTasksAsync(new[] { tesTask }, new[] { new CloudTask(tesTask.Id, "") }, System.Threading.CancellationToken.None), _ => batchScheduler.ProcessTerminatedTesTasksAsync(new[] { tesTask }, System.Threading.CancellationToken.None), @@ -1619,13 +1633,13 @@ private static async Task GetNewTesTaskStateAsync(TesTask tesTask, Azu return tesTask.State; } - private static Task GetNewTesTaskStateAsync(TesState currentTesTaskState, AzureBatchJobAndTaskState azureBatchJobAndTaskState) - => GetNewTesTaskStateAsync(new TesTask { Id = "test", State = currentTesTaskState, Executors = new() { new() { Image = "imageName1", Command = new() { "command" } } } }, azureBatchJobAndTaskState); + private static Task GetNewTesTaskStateAsync(TesState currentTesTaskState/*, AzureBatchJobAndTaskState azureBatchJobAndTaskState*/) + => GetNewTesTaskStateAsync(new TesTask { Id = "test", State = currentTesTaskState, Executors = new() { new() { Image = "imageName1", Command = new() { "command" } } } }/*, azureBatchJobAndTaskState*/); - private static Task GetNewTesTaskStateAsync(TesTask tesTask, AzureBatchJobAndTaskState? azureBatchJobAndTaskState = null) + private static Task GetNewTesTaskStateAsync(TesTask tesTask/*, AzureBatchJobAndTaskState? 
azureBatchJobAndTaskState = null*/) { var azureProxyReturnValues = AzureProxyReturnValues.Defaults; - azureProxyReturnValues.BatchJobAndTaskState = azureBatchJobAndTaskState ?? azureProxyReturnValues.BatchJobAndTaskState; + //azureProxyReturnValues.BatchJobAndTaskState = azureBatchJobAndTaskState ?? azureProxyReturnValues.BatchJobAndTaskState; return GetNewTesTaskStateAsync(tesTask, azureProxyReturnValues); } @@ -1660,8 +1674,8 @@ private static Action> GetMockAzureProxy(AzureProxyReturnValue azureProxy.Setup(a => a.GetActivePoolsAsync(It.IsAny())) .Returns(AsyncEnumerable.Empty()); - azureProxy.Setup(a => a.GetBatchJobAndTaskStateAsync(It.IsAny(), It.IsAny())) - .Returns(Task.FromResult(azureProxyReturnValues.BatchJobAndTaskState)); + //azureProxy.Setup(a => a.GetBatchJobAndTaskStateAsync(It.IsAny(), It.IsAny())) + // .Returns(Task.FromResult(azureProxyReturnValues.BatchJobAndTaskState)); azureProxy.Setup(a => a.GetStorageAccountInfoAsync("defaultstorageaccount", It.IsAny())) .Returns(Task.FromResult(azureProxyReturnValues.StorageAccountInfos["defaultstorageaccount"])); @@ -1815,22 +1829,22 @@ internal static async ValueTask GuardAssertsWithTesTask(TesTask tesTask, Func new() { JobState = JobState.Active, TaskState = TaskState.Active }; - public static AzureBatchJobAndTaskState TaskPreparing => new() { JobState = JobState.Active, TaskState = TaskState.Preparing }; - public static AzureBatchJobAndTaskState TaskRunning => new() { JobState = JobState.Active, TaskState = TaskState.Running }; - public static AzureBatchJobAndTaskState TaskCompletedSuccessfully => new() { JobState = JobState.Completed, TaskState = TaskState.Completed, TaskExitCode = 0 }; - public static AzureBatchJobAndTaskState TaskFailed => new() { JobState = JobState.Completed, TaskState = TaskState.Completed, TaskExitCode = -1 }; - public static AzureBatchJobAndTaskState JobNotFound => new() { JobState = null }; - public static AzureBatchJobAndTaskState TaskNotFound => new() { JobState = 
JobState.Active, TaskState = null }; - public static AzureBatchJobAndTaskState MoreThanOneJobFound => new() { MoreThanOneActiveJobOrTaskFound = true }; - public static AzureBatchJobAndTaskState NodeAllocationFailed => new() { JobState = JobState.Active, NodeAllocationFailed = true }; - public static AzureBatchJobAndTaskState NodePreempted => new() { JobState = JobState.Active, NodeState = ComputeNodeState.Preempted }; - public static AzureBatchJobAndTaskState NodeDiskFull => new() { JobState = JobState.Active, NodeErrorCode = "DiskFull" }; - public static AzureBatchJobAndTaskState ActiveJobWithMissingAutoPool => new() { ActiveJobWithMissingAutoPool = true }; - public static AzureBatchJobAndTaskState ImageDownloadFailed => new() { JobState = JobState.Active, NodeErrorCode = "ContainerInvalidImage" }; - } + //private struct BatchJobAndTaskStates + //{ + // public static AzureBatchJobAndTaskState TaskActive => new() { JobState = JobState.Active, TaskState = TaskState.Active }; + // public static AzureBatchJobAndTaskState TaskPreparing => new() { JobState = JobState.Active, TaskState = TaskState.Preparing }; + // public static AzureBatchJobAndTaskState TaskRunning => new() { JobState = JobState.Active, TaskState = TaskState.Running }; + // public static AzureBatchJobAndTaskState TaskCompletedSuccessfully => new() { JobState = JobState.Completed, TaskState = TaskState.Completed, TaskExitCode = 0 }; + // public static AzureBatchJobAndTaskState TaskFailed => new() { JobState = JobState.Completed, TaskState = TaskState.Completed, TaskExitCode = -1 }; + // public static AzureBatchJobAndTaskState JobNotFound => new() { JobState = null }; + // public static AzureBatchJobAndTaskState TaskNotFound => new() { JobState = JobState.Active, TaskState = null }; + // public static AzureBatchJobAndTaskState MoreThanOneJobFound => new() { MoreThanOneActiveJobOrTaskFound = true }; + // public static AzureBatchJobAndTaskState NodeAllocationFailed => new() { JobState = JobState.Active, 
NodeAllocationFailed = true }; + // public static AzureBatchJobAndTaskState NodePreempted => new() { JobState = JobState.Active, NodeState = ComputeNodeState.Preempted }; + // public static AzureBatchJobAndTaskState NodeDiskFull => new() { JobState = JobState.Active, NodeErrorCode = "DiskFull" }; + // public static AzureBatchJobAndTaskState ActiveJobWithMissingAutoPool => new() { ActiveJobWithMissingAutoPool = true }; + // public static AzureBatchJobAndTaskState ImageDownloadFailed => new() { JobState = JobState.Active, NodeErrorCode = "ContainerInvalidImage" }; + //} private class AzureProxyReturnValues { @@ -1845,7 +1859,7 @@ private class AzureProxyReturnValues public IEnumerable ActiveNodeCountByVmSize { get; set; } public int ActiveJobCount { get; set; } public int ActivePoolCount { get; set; } - public AzureBatchJobAndTaskState BatchJobAndTaskState { get; set; } + //public AzureBatchJobAndTaskState BatchJobAndTaskState { get; set; } public string StorageAccountKey { get; set; } public string DownloadedBlobContent { get; set; } public bool LocalFileExists { get; set; } @@ -1870,7 +1884,7 @@ private class AzureProxyReturnValues ActiveNodeCountByVmSize = new List(), ActiveJobCount = 0, ActivePoolCount = 0, - BatchJobAndTaskState = BatchJobAndTaskStates.JobNotFound, + //BatchJobAndTaskState = BatchJobAndTaskStates.JobNotFound, StorageAccountKey = "Key1", DownloadedBlobContent = string.Empty, LocalFileExists = true diff --git a/src/TesApi.Web/AzureBatchJobAndTaskState.cs b/src/TesApi.Web/AzureBatchJobAndTaskState.cs deleted file mode 100644 index b9e6cf734..000000000 --- a/src/TesApi.Web/AzureBatchJobAndTaskState.cs +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. 
- -using System; -using System.Collections.Generic; -using Microsoft.Azure.Batch; -using Microsoft.Azure.Batch.Common; - -namespace TesApi.Web -{ - /// - /// Gets the combined state of Azure Batch job, task and pool that corresponds to the given TES task - /// - public struct AzureBatchJobAndTaskState - { - /// - /// More than one active Azure Batch job and/or task was found in the active state for this task. No other members are set. - /// - public bool MoreThanOneActiveJobOrTaskFound { get; set; } - /// - /// Job is active but has not had a pool assigned within a configured time limit. - /// - public bool ActiveJobWithMissingAutoPool { get; set; } - /// - /// Attempt number for this task. The other members are from this attempt. - /// - public int AttemptNumber { get; set; } // TODO: consider removing - /// - /// The Batch service encountered an error while resizing - /// the pool or the pool's Microsoft.Azure.Batch.CloudPool.AllocationState - /// was Steady. - /// - public bool NodeAllocationFailed { get; set; } - /// - /// Gets a code for the first error encountered by the compute node in this attempt. See Microsoft.Azure.Batch.Common.BatchErrorCodeStrings - /// for possible values. - /// - public string NodeErrorCode { get; set; } - /// - /// Gets a list of additional error details related to the first error encountered by the compute node. - /// - public IEnumerable NodeErrorDetails { get; set; } - /// - /// Gets the current state of Azure Batch job. Is `null` if no batch jobs corresponding to the TES task was found. - /// - public JobState? JobState { get; set; } - /// - /// Gets the current state of the compute node. - /// - public ComputeNodeState? NodeState { get; set; } - /// - /// Gets the current state of the Azure Batch task. - /// - public TaskState? TaskState { get; set; } - /// - /// Gets the exit code of the program specified on the task command line. 
- /// - /// - /// This property is only returned if the task is in the Microsoft.Azure.Batch.Common.TaskState.Completed - /// state. The exit code for a process reflects the specific convention implemented - /// by the application developer for that process. If you use the exit code value - /// to make decisions in your code, be sure that you know the exit code convention - /// used by the application process. Note that the exit code may also be generated - /// by the compute node operating system, such as when a process is forcibly terminated. - /// - public int? TaskExitCode { get; set; } - /// - /// Gets the result of the task execution. - /// - /// - /// If the value is Microsoft.Azure.Batch.Common.TaskExecutionResult.Failure, then - /// the details of the failure can be found in the TaskFailureInformation - /// property. - /// - public TaskExecutionResult? TaskExecutionResult { get; set; } // TODO: consider removing - /// - /// Gets the time at which the task started running. - /// - public DateTime? TaskStartTime { get; set; } - /// - /// Gets the time at which the task completed. - /// - public DateTime? TaskEndTime { get; set; } - /// - /// Gets information describing the task failure, if any. - /// - /// - /// This property is set only if the task is in the Microsoft.Azure.Batch.Common.TaskState.Completed - /// state and encountered a failure. - /// - public TaskFailureInformation TaskFailureInformation { get; set; } - /// - /// Gets the state of the container under which the task is executing. - /// - /// - /// This is the state of the container according to the Docker service. It is equivilant - /// to the status field returned by "docker inspect". - /// - public string TaskContainerState { get; set; } // TODO: consider removing - /// - /// Gets detailed error information about the container under which the task is executing. - /// - /// - /// This is the detailed error string from the Docker service, if available. 
It is - /// equivilant to the error field returned by "docker inspect". - /// - public string TaskContainerError { get; set; } // TODO: consider removing - /// - /// Gets the task's pool Id. - /// - public PoolInformation Pool { get; set; } - /// - /// The Azure Batch Node Id - /// - public string NodeId { get; set; } - } -} diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index 4d4699d3e..85657043e 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -236,142 +236,142 @@ public async Task DeleteBatchJobAsync(PoolInformation poolInformation, Cancellat await batchClient.JobOperations.DeleteJobAsync(poolInformation.PoolId, cancellationToken: cancellationToken); } - /// - [System.Diagnostics.CodeAnalysis.SuppressMessage("Performance", "CA1826:Do not use Enumerable methods on indexable collections", Justification = "FirstOrDefault() is straightforward, the alternative is less clear.")] - public async Task GetBatchJobAndTaskStateAsync(TesTask tesTask, CancellationToken cancellationToken) - { - try - { - string nodeErrorCode = null; - IEnumerable nodeErrorDetails = null; - var activeJobWithMissingAutoPool = false; - ComputeNodeState? nodeState = null; - TaskState? 
taskState = null; - string poolId = null; - TaskExecutionInformation taskExecutionInformation = null; - CloudJob job = null; - var attemptNumber = 0; - CloudTask batchTask = null; - - var jobOrTaskFilter = new ODATADetailLevel - { - FilterClause = $"startswith(id,'{tesTask.Id}{BatchJobAttemptSeparator}')", - SelectClause = "*" - }; - - if (string.IsNullOrWhiteSpace(tesTask.PoolId)) - { - return new AzureBatchJobAndTaskState { JobState = null }; - } - - try - { - job = await batchClient.JobOperations.GetJobAsync(tesTask.PoolId, cancellationToken: cancellationToken); - } - catch (BatchException ex) when (ex.InnerException is Microsoft.Azure.Batch.Protocol.Models.BatchErrorException e && e.Response.StatusCode == System.Net.HttpStatusCode.NotFound) - { - logger.LogError(ex, @"Failed to get job for TesTask {TesTask}", tesTask.Id); - return new AzureBatchJobAndTaskState { JobState = null }; - } - - var taskInfos = await batchClient.JobOperations.ListTasks(tesTask.PoolId, jobOrTaskFilter).ToAsyncEnumerable() - .Select(t => new { Task = t, AttemptNumber = int.Parse(t.Id.Split(BatchJobAttemptSeparator)[1]) }) - .ToListAsync(cancellationToken); - - if (!taskInfos.Any()) - { - logger.LogError(@"Failed to get task for TesTask {TesTask}", tesTask.Id); - } - else - { - if (taskInfos.Count(t => t.Task.State != TaskState.Completed) > 1) - { - return new AzureBatchJobAndTaskState { MoreThanOneActiveJobOrTaskFound = true }; - } - - var lastTaskInfo = taskInfos.OrderBy(t => t.AttemptNumber).Last(); - batchTask = lastTaskInfo.Task; - attemptNumber = lastTaskInfo.AttemptNumber; - } - - poolId = job.ExecutionInformation?.PoolId; - - - var nodeId = string.Empty; - - if (job.State == JobState.Active && poolId is not null) - { - var poolFilter = new ODATADetailLevel - { - SelectClause = "*" - }; - - CloudPool pool; - - try - { - pool = await batchClient.PoolOperations.GetPoolAsync(poolId, poolFilter, cancellationToken: cancellationToken); - } - catch (BatchException ex) when 
(ex.InnerException is Microsoft.Azure.Batch.Protocol.Models.BatchErrorException e && e.Response?.StatusCode == System.Net.HttpStatusCode.NotFound) - { - pool = default; - } - - if (pool is not null) - { - var node = await pool.ListComputeNodes().ToAsyncEnumerable().FirstOrDefaultAsync(n => (n.RecentTasks?.Select(t => t.TaskId) ?? Enumerable.Empty()).Contains(batchTask?.Id), cancellationToken); - - if (node is not null) - { - nodeId = node.Id; - nodeState = node.State; - var nodeError = node.Errors?.FirstOrDefault(e => "DiskFull".Equals(e.Code, StringComparison.InvariantCultureIgnoreCase)) ?? node.Errors?.FirstOrDefault(); // Prioritize DiskFull errors - nodeErrorCode = nodeError?.Code; - nodeErrorDetails = nodeError?.ErrorDetails?.Select(e => e.Value); - } - } - else - { - if (job.CreationTime.HasValue && DateTime.UtcNow.Subtract(job.CreationTime.Value) > TimeSpan.FromMinutes(30)) - { - activeJobWithMissingAutoPool = true; - } - } - } - - if (batchTask is not null) - { - taskState = batchTask.State; - taskExecutionInformation = batchTask.ExecutionInformation; - } - - return new AzureBatchJobAndTaskState - { - MoreThanOneActiveJobOrTaskFound = false, - ActiveJobWithMissingAutoPool = activeJobWithMissingAutoPool, - AttemptNumber = attemptNumber, - NodeErrorCode = nodeErrorCode, - NodeErrorDetails = nodeErrorDetails, - NodeState = nodeState, - JobState = job.State, - TaskState = taskState, - Pool = new() { PoolId = poolId }, - TaskExecutionResult = taskExecutionInformation?.Result, - TaskStartTime = taskExecutionInformation?.StartTime, - TaskEndTime = taskExecutionInformation?.EndTime, - TaskExitCode = taskExecutionInformation?.ExitCode, - TaskFailureInformation = taskExecutionInformation?.FailureInformation, - TaskContainerState = taskExecutionInformation?.ContainerInformation?.State, - TaskContainerError = taskExecutionInformation?.ContainerInformation?.Error, - NodeId = !string.IsNullOrEmpty(nodeId) ? 
nodeId : null - }; - } - catch (Exception ex) - { - logger.LogError(ex, @"GetBatchJobAndTaskStateAsync failed for TesTask {TesTask}", tesTask.Id); - throw; - } - } + ///// + //[System.Diagnostics.CodeAnalysis.SuppressMessage("Performance", "CA1826:Do not use Enumerable methods on indexable collections", Justification = "FirstOrDefault() is straightforward, the alternative is less clear.")] + //public async Task GetBatchJobAndTaskStateAsync(TesTask tesTask, CancellationToken cancellationToken) + //{ + // try + // { + // string nodeErrorCode = null; + // IEnumerable nodeErrorDetails = null; + // var activeJobWithMissingAutoPool = false; + // ComputeNodeState? nodeState = null; + // TaskState? taskState = null; + // string poolId = null; + // TaskExecutionInformation taskExecutionInformation = null; + // CloudJob job = null; + // var attemptNumber = 0; + // CloudTask batchTask = null; + + // var jobOrTaskFilter = new ODATADetailLevel + // { + // FilterClause = $"startswith(id,'{tesTask.Id}{BatchJobAttemptSeparator}')", + // SelectClause = "*" + // }; + + // if (string.IsNullOrWhiteSpace(tesTask.PoolId)) + // { + // return new AzureBatchJobAndTaskState { JobState = null }; + // } + + // try + // { + // job = await batchClient.JobOperations.GetJobAsync(tesTask.PoolId, cancellationToken: cancellationToken); + // } + // catch (BatchException ex) when (ex.InnerException is Microsoft.Azure.Batch.Protocol.Models.BatchErrorException e && e.Response.StatusCode == System.Net.HttpStatusCode.NotFound) + // { + // logger.LogError(ex, @"Failed to get job for TesTask {TesTask}", tesTask.Id); + // return new AzureBatchJobAndTaskState { JobState = null }; + // } + + // var taskInfos = await batchClient.JobOperations.ListTasks(tesTask.PoolId, jobOrTaskFilter).ToAsyncEnumerable() + // .Select(t => new { Task = t, AttemptNumber = int.Parse(t.Id.Split(BatchJobAttemptSeparator)[1]) }) + // .ToListAsync(cancellationToken); + + // if (!taskInfos.Any()) + // { + // logger.LogError(@"Failed to 
get task for TesTask {TesTask}", tesTask.Id); + // } + // else + // { + // if (taskInfos.Count(t => t.Task.State != TaskState.Completed) > 1) + // { + // return new AzureBatchJobAndTaskState { MoreThanOneActiveJobOrTaskFound = true }; + // } + + // var lastTaskInfo = taskInfos.OrderBy(t => t.AttemptNumber).Last(); + // batchTask = lastTaskInfo.Task; + // attemptNumber = lastTaskInfo.AttemptNumber; + // } + + // poolId = job.ExecutionInformation?.PoolId; + + + // var nodeId = string.Empty; + + // if (job.State == JobState.Active && poolId is not null) + // { + // var poolFilter = new ODATADetailLevel + // { + // SelectClause = "*" + // }; + + // CloudPool pool; + + // try + // { + // pool = await batchClient.PoolOperations.GetPoolAsync(poolId, poolFilter, cancellationToken: cancellationToken); + // } + // catch (BatchException ex) when (ex.InnerException is Microsoft.Azure.Batch.Protocol.Models.BatchErrorException e && e.Response?.StatusCode == System.Net.HttpStatusCode.NotFound) + // { + // pool = default; + // } + + // if (pool is not null) + // { + // var node = await pool.ListComputeNodes().ToAsyncEnumerable().FirstOrDefaultAsync(n => (n.RecentTasks?.Select(t => t.TaskId) ?? Enumerable.Empty()).Contains(batchTask?.Id), cancellationToken); + + // if (node is not null) + // { + // nodeId = node.Id; + // nodeState = node.State; + // var nodeError = node.Errors?.FirstOrDefault(e => "DiskFull".Equals(e.Code, StringComparison.InvariantCultureIgnoreCase)) ?? 
node.Errors?.FirstOrDefault(); // Prioritize DiskFull errors + // nodeErrorCode = nodeError?.Code; + // nodeErrorDetails = nodeError?.ErrorDetails?.Select(e => e.Value); + // } + // } + // else + // { + // if (job.CreationTime.HasValue && DateTime.UtcNow.Subtract(job.CreationTime.Value) > TimeSpan.FromMinutes(30)) + // { + // activeJobWithMissingAutoPool = true; + // } + // } + // } + + // if (batchTask is not null) + // { + // taskState = batchTask.State; + // taskExecutionInformation = batchTask.ExecutionInformation; + // } + + // return new AzureBatchJobAndTaskState + // { + // MoreThanOneActiveJobOrTaskFound = false, + // ActiveJobWithMissingAutoPool = activeJobWithMissingAutoPool, + // AttemptNumber = attemptNumber, + // NodeErrorCode = nodeErrorCode, + // NodeErrorDetails = nodeErrorDetails, + // NodeState = nodeState, + // JobState = job.State, + // TaskState = taskState, + // Pool = new() { PoolId = poolId }, + // TaskExecutionResult = taskExecutionInformation?.Result, + // TaskStartTime = taskExecutionInformation?.StartTime, + // TaskEndTime = taskExecutionInformation?.EndTime, + // TaskExitCode = taskExecutionInformation?.ExitCode, + // TaskFailureInformation = taskExecutionInformation?.FailureInformation, + // TaskContainerState = taskExecutionInformation?.ContainerInformation?.State, + // TaskContainerError = taskExecutionInformation?.ContainerInformation?.Error, + // NodeId = !string.IsNullOrEmpty(nodeId) ? 
nodeId : null + // }; + // } + // catch (Exception ex) + // { + // logger.LogError(ex, @"GetBatchJobAndTaskStateAsync failed for TesTask {TesTask}", tesTask.Id); + // throw; + // } + //} /// public async Task DeleteBatchJobAsync(string tesTaskId, CancellationToken cancellationToken = default) diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index 2c32a3d02..865add3c5 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -4,6 +4,7 @@ using System; using System.Collections.Generic; using System.Linq; +using System.Runtime.CompilerServices; using System.Threading; using System.Threading.Tasks; using Azure; @@ -497,38 +498,44 @@ async ValueTask PerformTask(ValueTask serviceAction, CancellationToken can catch (Exception ex) { exceptions.Add(ex); - return await RemoveMissingPools(ex); + return await RemoveMissingPoolsAsync(ex, cancellationToken); } } return false; } + } + + /// + public IAsyncEnumerable GetCompletedTasks(CancellationToken _1) + { + return _azureProxy.ListTasksAsync(Pool.PoolId, new ODATADetailLevel { FilterClause = "id,executionInfo", SelectClause = "state eq completed" }); + } - async ValueTask RemoveMissingPools(Exception ex) + private async ValueTask RemoveMissingPoolsAsync(Exception ex, CancellationToken cancellationToken) + { + switch (ex) { - switch (ex) - { - case AggregateException aggregateException: - var result = true; - foreach (var e in aggregateException.InnerExceptions) - { - result &= await RemoveMissingPools(e); - } - return result; + case AggregateException aggregateException: + var result = true; + foreach (var e in aggregateException.InnerExceptions) + { + result &= await RemoveMissingPoolsAsync(e, cancellationToken); + } + return result; - case BatchException batchException: - if (batchException.RequestInformation.BatchError.Code == BatchErrorCodeStrings.PoolNotFound) - { - _logger.LogError(ex, "Batch pool {PoolId} is missing. 
Removing it from TES's active pool list.", Pool.PoolId); - _ = _batchPools.RemovePoolFromList(this); - // TODO: Consider moving any remaining tasks to another pool, or failing tasks explicitly - await _batchPools.DeletePoolAsync(this, cancellationToken); // Ensure job removal too - return false; - } - break; - } - return true; + case BatchException batchException: + if (batchException.RequestInformation.BatchError.Code == BatchErrorCodeStrings.PoolNotFound) + { + _logger.LogError(ex, "Batch pool {PoolId} is missing. Removing it from TES's active pool list.", Pool.PoolId); + _ = _batchPools.RemovePoolFromList(this); + // TODO: Consider moving any remaining tasks to another pool, or failing tasks explicitly + await _batchPools.DeletePoolAsync(this, cancellationToken); // Ensure job removal too + return false; + } + break; } + return true; } /// diff --git a/src/TesApi.Web/BatchPoolService.cs b/src/TesApi.Web/BatchPoolService.cs index ef1b41772..109c1ff89 100644 --- a/src/TesApi.Web/BatchPoolService.cs +++ b/src/TesApi.Web/BatchPoolService.cs @@ -2,12 +2,15 @@ // Licensed under the MIT License. using System; +using System.Collections.Generic; using System.Linq; +using System.Runtime.CompilerServices; using System.Threading; using System.Threading.Tasks; -using Microsoft.Extensions.Hosting; +using Microsoft.Azure.Batch; using Microsoft.Extensions.Logging; -using Microsoft.Extensions.Options; +using Tes.Models; +using Tes.Repository; namespace TesApi.Web { @@ -16,11 +19,8 @@ namespace TesApi.Web /// This should only be used as a system-wide singleton service. This class does not support scale-out on multiple machines, /// nor does it implement a leasing mechanism. In the future, consider using the Lease Blob operation. 
/// - public class BatchPoolService : BackgroundService + internal class BatchPoolService : OrchestrateOnBatchSchedulerService { - private readonly IBatchScheduler _batchScheduler; - private readonly ILogger _logger; - /// /// Interval between each call to . /// @@ -29,77 +29,119 @@ public class BatchPoolService : BackgroundService /// /// Default constructor /// + /// The main TES task database repository implementation /// /// /// - public BatchPoolService(IBatchScheduler batchScheduler, ILogger logger) + public BatchPoolService(IRepository repository, IBatchScheduler batchScheduler, ILogger logger) + : base(repository, batchScheduler, logger) { } + + /// + protected override void ExecuteSetup(CancellationToken stoppingToken) { - _batchScheduler = batchScheduler ?? throw new ArgumentNullException(nameof(batchScheduler)); - _logger = logger ?? throw new ArgumentNullException(nameof(logger)); + batchScheduler.LoadExistingPoolsAsync(stoppingToken).Wait(stoppingToken); // Delay starting Scheduler until this completes to finish initializing BatchScheduler. } /// - public override Task StopAsync(CancellationToken cancellationToken) + protected override Task ExecuteCoreAsync(CancellationToken stoppingToken) { - _logger.LogInformation("Batch Pools stopping..."); - return base.StopAsync(cancellationToken); + return Task.WhenAll(ServiceBatchPoolsAsync(stoppingToken), ExecuteCompletedTesTasksOnBatchAsync(stoppingToken)); } - /// - protected override async Task ExecuteAsync(CancellationToken stoppingToken) + /// + /// Performs an action on each batch pool. + /// + /// + /// + /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. 
+ /// + private async ValueTask ExecuteActionOnPoolsAsync(string pollName, Func action, CancellationToken stoppingToken) { - _logger.LogInformation("Batch Pools started."); - _batchScheduler.LoadExistingPoolsAsync(stoppingToken).Wait(stoppingToken); // Delay starting Scheduler until this completes to finish initializing BatchScheduler. + ArgumentNullException.ThrowIfNull(action); + + var pools = batchScheduler.GetPools().ToList(); - while (!stoppingToken.IsCancellationRequested) + if (0 == pools.Count) + { + return; + } + + var startTime = DateTime.UtcNow; + + foreach (var pool in pools) { try { - await ServiceBatchPools(stoppingToken); - await Task.Delay(RunInterval, stoppingToken); - } - catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested) - { - break; + await action(pool, stoppingToken); } catch (Exception exc) { - _logger.LogError(exc, @"{ExceptionMessage}", exc.Message); + logger.LogError(exc, @"Batch pool {PoolId} threw an exception in {Poll}.", pool.Pool?.PoolId, pollName); } } - _logger.LogInformation("Batch Pools gracefully stopped."); + logger.LogDebug(@"{Poll} for {PoolsCount} pools completed in {TotalSeconds} seconds.", pollName, pools.Count, DateTime.UtcNow.Subtract(startTime).TotalSeconds); + } + + /// + /// Calls repeatedly. + /// + /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. + /// + private Task ServiceBatchPoolsAsync(CancellationToken stoppingToken) + { + return ExecuteActionOnIntervalAsync(RunInterval, ExecuteServiceBatchPoolsAsync, stoppingToken); } /// /// Retrieves all batch pools from the database and affords an opportunity to react to changes. /// - /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. + /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. 
/// - private async ValueTask ServiceBatchPools(CancellationToken cancellationToken) + private ValueTask ExecuteServiceBatchPoolsAsync(CancellationToken stoppingToken) { - var pools = _batchScheduler.GetPools().ToList(); + return ExecuteActionOnPoolsAsync("ServiceBatchPools", (pool, token) => pool.ServicePoolAsync(token), stoppingToken); + } - if (0 == pools.Count) - { - return; - } + /// + /// Calls repeatedly. + /// + /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. + /// + private Task ExecuteCompletedTesTasksOnBatchAsync(CancellationToken stoppingToken) + { + return ExecuteActionOnIntervalAsync(RunInterval, ProcessCompletedCloudTasksAsync, stoppingToken); + } - var startTime = DateTime.UtcNow; + /// + /// Retrieves all completed tasks from every batch pools from the database and affords an opportunity to react to changes. + /// + /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. + /// + private async ValueTask ProcessCompletedCloudTasksAsync(CancellationToken stoppingToken) + { + var tasks = new List(); + await ExecuteActionOnPoolsAsync("ServiceBatchTasks", async (pool, token) => tasks.AddRange(await pool.GetCompletedTasks(token).ToListAsync(token)), stoppingToken); - foreach (var pool in pools) + await OrchestrateTesTasksOnBatchAsync( + "Completed", +#pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously + async token => GetTesTasks(token), +#pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously + (tesTasks, token) => batchScheduler.ProcessCompletedTesTasksAsync(tesTasks, tasks.ToArray(), token), + stoppingToken); + + async IAsyncEnumerable GetTesTasks([EnumeratorCancellation] CancellationToken cancellationToken) { - try + foreach (var id in tasks.Select(t => t.Id)) { - await pool.ServicePoolAsync(cancellationToken); - } - catch (Exception exc) - { - _logger.LogError(exc, 
"Batch pool {PoolId} threw an exception in ServiceBatchPools.", pool.Pool?.PoolId); + TesTask tesTask = default; + if (await repository.TryGetItemAsync(id, cancellationToken, task => tesTask = task) && tesTask is not null) + { + yield return tesTask; + } } } - - _logger.LogDebug(@"ServiceBatchPools for {PoolsCount} pools completed in {TotalSeconds} seconds.", pools.Count, DateTime.UtcNow.Subtract(startTime).TotalSeconds); } } } diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index a6ef552cf..a6d48e081 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -186,87 +186,83 @@ public BatchScheduler( //static bool tesTaskCancellationRequested(TesTask tesTask) => tesTask.State == TesState.CANCELINGEnum; //static bool tesTaskDeletionReady(TesTask tesTask) => tesTask.IsTaskDeletionRequired; - static void SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, CombinedBatchTaskInfo batchInfo) - { - tesTask.State = newTaskState; - - var tesTaskLog = tesTask.GetOrAddTesTaskLog(); - var tesTaskExecutorLog = tesTaskLog.GetOrAddExecutorLog(); - - tesTaskLog.BatchNodeMetrics = batchInfo.BatchNodeMetrics; - tesTaskLog.CromwellResultCode = batchInfo.CromwellRcCode; - tesTaskLog.EndTime = DateTime.UtcNow; - tesTaskExecutorLog.StartTime = batchInfo.BatchTaskStartTime; - tesTaskExecutorLog.EndTime = batchInfo.BatchTaskEndTime; - tesTaskExecutorLog.ExitCode = batchInfo.BatchTaskExitCode; - - // Only accurate when the task completes successfully, otherwise it's the Batch time as reported from Batch - // TODO this could get large; why? 
- //var timefromCoAScriptCompletionToBatchTaskDetectedComplete = tesTaskLog.EndTime - tesTaskExecutorLog.EndTime; - - tesTask.SetFailureReason(batchInfo.FailureReason); - - if (batchInfo.SystemLogItems is not null) - { - tesTask.AddToSystemLog(batchInfo.SystemLogItems); - } - else if (!string.IsNullOrWhiteSpace(batchInfo.AlternateSystemLogItem)) - { - tesTask.AddToSystemLog(new[] { batchInfo.AlternateSystemLogItem }); - } - } - - async Task SetTaskCompleted(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) - { - await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); - SetTaskStateAndLog(tesTask, TesState.COMPLETEEnum, batchInfo); - } + //static void SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, CombinedBatchTaskInfo batchInfo) + //{ + // tesTask.State = newTaskState; + + // var tesTaskLog = tesTask.GetOrAddTesTaskLog(); + // var tesTaskExecutorLog = tesTaskLog.GetOrAddExecutorLog(); + + // tesTaskLog.BatchNodeMetrics = batchInfo.BatchNodeMetrics; + // tesTaskLog.CromwellResultCode = batchInfo.CromwellRcCode; + // tesTaskLog.EndTime = DateTime.UtcNow; + // tesTaskExecutorLog.StartTime = batchInfo.BatchTaskStartTime; + // tesTaskExecutorLog.EndTime = batchInfo.BatchTaskEndTime; + // tesTaskExecutorLog.ExitCode = batchInfo.BatchTaskExitCode; + + // // Only accurate when the task completes successfully, otherwise it's the Batch time as reported from Batch + // // TODO this could get large; why? 
+ // //var timefromCoAScriptCompletionToBatchTaskDetectedComplete = tesTaskLog.EndTime - tesTaskExecutorLog.EndTime; + + // tesTask.SetFailureReason(batchInfo.FailureReason); + + // if (batchInfo.SystemLogItems is not null) + // { + // tesTask.AddToSystemLog(batchInfo.SystemLogItems); + // } + // else if (!string.IsNullOrWhiteSpace(batchInfo.AlternateSystemLogItem)) + // { + // tesTask.AddToSystemLog(new[] { batchInfo.AlternateSystemLogItem }); + // } + //} + + //async Task SetTaskCompleted(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) + //{ + // await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); + // SetTaskStateAndLog(tesTask, TesState.COMPLETEEnum, batchInfo); + //} - async Task SetTaskExecutorError(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) - { - await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); - SetTaskStateAndLog(tesTask, TesState.EXECUTORERROREnum, batchInfo); - } + //async Task SetTaskExecutorError(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) + //{ + // await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); + // SetTaskStateAndLog(tesTask, TesState.EXECUTORERROREnum, batchInfo); + //} - async Task SetTaskSystemError(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) - { - await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); - SetTaskStateAndLog(tesTask, TesState.SYSTEMERROREnum, batchInfo); - } + //async Task SetTaskSystemError(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) + //{ + // await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); + // SetTaskStateAndLog(tesTask, TesState.SYSTEMERROREnum, batchInfo); + //} - async Task DeleteBatchJobAndSetTaskStateAsync(TesTask tesTask, TesState 
newTaskState, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) - { - await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); - SetTaskStateAndLog(tesTask, newTaskState, batchInfo); - } + //async Task DeleteBatchJobAndSetTaskStateAsync(TesTask tesTask, TesState newTaskState, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) + //{ + // await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); + // SetTaskStateAndLog(tesTask, newTaskState, batchInfo); + //} - Task DeleteBatchJobAndSetTaskExecutorErrorAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) => DeleteBatchJobAndSetTaskStateAsync(tesTask, TesState.EXECUTORERROREnum, batchInfo, cancellationToken); - Task DeleteBatchJobAndSetTaskSystemErrorAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) => DeleteBatchJobAndSetTaskStateAsync(tesTask, TesState.SYSTEMERROREnum, batchInfo, cancellationToken); + //Task DeleteBatchJobAndSetTaskExecutorErrorAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) => DeleteBatchJobAndSetTaskStateAsync(tesTask, TesState.EXECUTORERROREnum, batchInfo, cancellationToken); + //Task DeleteBatchJobAndSetTaskSystemErrorAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) => DeleteBatchJobAndSetTaskStateAsync(tesTask, TesState.SYSTEMERROREnum, batchInfo, cancellationToken); - Task DeleteBatchJobAndRequeueTaskAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) - => ++tesTask.ErrorCount > 3 - ? 
AddSystemLogAndDeleteBatchJobAndSetTaskExecutorErrorAsync(tesTask, batchInfo, "System Error: Retry count exceeded.", cancellationToken) - : DeleteBatchJobAndSetTaskStateAsync(tesTask, TesState.QUEUEDEnum, batchInfo, cancellationToken); + //Task DeleteBatchJobAndRequeueTaskAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) + // => ++tesTask.ErrorCount > 3 + // ? AddSystemLogAndDeleteBatchJobAndSetTaskExecutorErrorAsync(tesTask, batchInfo, "System Error: Retry count exceeded.", cancellationToken) + // : DeleteBatchJobAndSetTaskStateAsync(tesTask, TesState.QUEUEDEnum, batchInfo, cancellationToken); - Task AddSystemLogAndDeleteBatchJobAndSetTaskExecutorErrorAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, string alternateSystemLogItem, CancellationToken cancellationToken) - { - batchInfo.SystemLogItems ??= Enumerable.Empty().Append(alternateSystemLogItem); - return DeleteBatchJobAndSetTaskExecutorErrorAsync(tesTask, batchInfo, cancellationToken); - } + //Task AddSystemLogAndDeleteBatchJobAndSetTaskExecutorErrorAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, string alternateSystemLogItem, CancellationToken cancellationToken) + //{ + // batchInfo.SystemLogItems ??= Enumerable.Empty().Append(alternateSystemLogItem); + // return DeleteBatchJobAndSetTaskExecutorErrorAsync(tesTask, batchInfo, cancellationToken); + //} - Task HandlePreemptedNodeAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) - { - logger.LogInformation("The TesTask {TesTask}'s node was preempted. It will be automatically rescheduled.", tesTask.Id); - tesTask.State = TesState.INITIALIZINGEnum; - return Task.FromResult(false); - } + //Task HandlePreemptedNodeAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) + //{ + // logger.LogInformation("The TesTask {TesTask}'s node was preempted. 
It will be automatically rescheduled.", tesTask.Id); + // tesTask.State = TesState.INITIALIZINGEnum; + // return Task.FromResult(false); + //} //tesTaskStateTransitions = new List() //{ - // new TesTaskStateTransition(tesTaskDeletionReady, batchTaskState: null, alternateSystemLogItem: null, DeleteCancelledTaskAsync), - // new TesTaskStateTransition(tesTaskCancellationRequested, batchTaskState: null, alternateSystemLogItem: null, CancelTaskAsync), - // new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.JobNotFound, alternateSystemLogItem: null, (tesTask, _, ct) => AddBatchTaskAsync(tesTask, ct)), - // new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.MissingBatchTask, alternateSystemLogItem: null, (tesTask, batchInfo, ct) => AddBatchTaskAsync(tesTask, ct)), // new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.Initializing, alternateSystemLogItem: null, (tesTask, _) => tesTask.State = TesState.INITIALIZINGEnum), // new TesTaskStateTransition(tesTaskIsQueuedOrInitializing, BatchTaskState.NodeAllocationFailed, alternateSystemLogItem: null, DeleteBatchJobAndRequeueTaskAsync), // new TesTaskStateTransition(tesTaskIsQueuedOrInitializing, BatchTaskState.Running, alternateSystemLogItem: null, (tesTask, _) => tesTask.State = TesState.RUNNINGEnum), @@ -366,33 +362,79 @@ public async Task UploadTaskRunnerIfNeeded(CancellationToken cancellationToken) } } - ///// - ///// Iteratively manages execution of a on Azure Batch until completion or failure - ///// - ///// The - ///// A for controlling the lifetime of the asynchronous operation. - ///// True if the TES task needs to be persisted. 
- //public async ValueTask ProcessTesTaskAsync(TesTask tesTask, CancellationToken cancellationToken) - //{ - // var combinedBatchTaskInfo = await GetBatchTaskStateAsync(tesTask, cancellationToken); - // const string template = "TES task: {TesTask} TES task state: {TesTaskState} BatchTaskState: {BatchTaskState}"; - // var msg = string.Format(ConvertTemplateToFormat(template), tesTask.Id, tesTask.State.ToString(), combinedBatchTaskInfo.BatchTaskState.ToString()); + /// + public async IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessCompletedTesTasksAsync(IEnumerable tesTasks, CloudTask[] cloudTasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) + { + ArgumentNullException.ThrowIfNull(tesTasks); + ArgumentNullException.ThrowIfNull(cloudTasks); - // if (onlyLogBatchTaskStateOnce.Add(msg)) - // { - // logger.LogInformation(template, tesTask.Id, tesTask.State.ToString(), combinedBatchTaskInfo.BatchTaskState.ToString()); - // } + foreach (var (cloudTask, tesTask) in cloudTasks.Zip(tesTasks)) + { + // Skip if this task was previously updated. 
+ if (tesTask.State != TesState.COMPLETEEnum && tesTask.State != TesState.SYSTEMERROREnum && tesTask.State != TesState.EXECUTORERROREnum) + { + try + { + await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); + } + catch (Exception ex) + { + logger.LogError(ex, @"Failure when deleting cloud task for TES task: {TesTask} Exception: {ExceptionType}: {ExceptionMessage}", tesTask.Id, ex.GetType().FullName, ex.Message); + tesTask.IsTaskDeletionRequired = true; + } - // return await HandleTesTaskTransitionAsync(tesTask, combinedBatchTaskInfo, cancellationToken); + if (cloudTask.ExecutionInformation.ExitCode != 0 || cloudTask.ExecutionInformation.FailureInformation is not null) + { + await SetTaskStateAndLog(tesTask, TesState.EXECUTORERROREnum, cloudTask); + } + else + { + await SetTaskStateAndLog(tesTask, TesState.COMPLETEEnum, cloudTask); + } - // static string ConvertTemplateToFormat(string template) - // => string.Join(null, template.Split('{', '}').Select((s, i) => (s, i)).Select(t => t.i % 2 == 0 ? t.s : $"{{{t.i / 2}}}")); - //} + yield return (tesTask, Task.FromResult(true)); + } + } - /// - public async IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessCompletedTesTasksAsync(IEnumerable tesTasks, CloudTask[] cloudTask, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) - { - yield break; + async ValueTask SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, CloudTask cloudTask) + { + var metrics = newTaskState == TesState.COMPLETEEnum + ? 
await GetBatchNodeMetricsAndCromwellResultCodeAsync(tesTask, cancellationToken) + : default; + + tesTask.State = newTaskState; + + var tesTaskLog = tesTask.GetOrAddTesTaskLog(); + var tesTaskExecutorLog = tesTaskLog.GetOrAddExecutorLog(); + + tesTaskLog.BatchNodeMetrics = metrics.BatchNodeMetrics; + tesTaskLog.CromwellResultCode = metrics.CromwellRcCode; + tesTaskLog.EndTime = DateTime.UtcNow; + tesTaskExecutorLog.StartTime = metrics.TaskStartTime ?? cloudTask.ExecutionInformation?.StartTime; + tesTaskExecutorLog.EndTime = metrics.TaskEndTime ?? cloudTask.ExecutionInformation?.EndTime; + tesTaskExecutorLog.ExitCode = cloudTask.ExecutionInformation?.ExitCode; + + // Only accurate when the task completes successfully, otherwise it's the Batch time as reported from Batch + // TODO this could get large; why? + //var timefromCoAScriptCompletionToBatchTaskDetectedComplete = tesTaskLog.EndTime - tesTaskExecutorLog.EndTime; + + tesTask.SetFailureReason(cloudTask.ExecutionInformation?.FailureInformation?.Code); + + var systemLogItems = newTaskState == TesState.COMPLETEEnum + ? default + : Enumerable.Empty() + .Append($"Batch task ExitCode: {cloudTask.ExecutionInformation?.ExitCode}, Failure message: {cloudTask.ExecutionInformation?.FailureInformation?.Message}") + .Concat(cloudTask.ExecutionInformation?.FailureInformation?.Details?.Select(d => $"{d.Name}: {d.Value}") ?? Enumerable.Empty()); + + if (systemLogItems is not null) + { + tesTask.AddToSystemLog(systemLogItems); + } + else if (newTaskState != TesState.COMPLETEEnum) + { + tesTask.AddToSystemLog(new[] { "Please open an issue. There should have been an error reported here." }); + } + } } /// @@ -732,239 +774,239 @@ Task HandleException(Exception exception, string poolKey, TesTask tesTask) } } - /// - /// Gets the current state of the Azure Batch task - /// - /// - /// A for controlling the lifetime of the asynchronous operation. 
- /// A higher-level abstraction of the current state of the Azure Batch task - [System.Diagnostics.CodeAnalysis.SuppressMessage("Performance", "CA1826:Do not use Enumerable methods on indexable collections", Justification = "FirstOrDefault() is straightforward, the alternative is less clear.")] - private async ValueTask GetBatchTaskStateAsync(TesTask tesTask, CancellationToken cancellationToken) - { - var azureBatchJobAndTaskState = await azureProxy.GetBatchJobAndTaskStateAsync(tesTask, cancellationToken); - - if (azureBatchJobAndTaskState.Pool?.PoolId is null) - { - azureBatchJobAndTaskState.Pool = tesTask.PoolId is null ? default : new() { PoolId = tesTask.PoolId }; - } - - static IEnumerable ConvertNodeErrorsToSystemLogItems(AzureBatchJobAndTaskState azureBatchJobAndTaskState) - { - var systemLogItems = new List(); - - if (azureBatchJobAndTaskState.NodeErrorCode is not null) - { - systemLogItems.Add(azureBatchJobAndTaskState.NodeErrorCode); - } - - if (azureBatchJobAndTaskState.NodeErrorDetails is not null) - { - systemLogItems.AddRange(azureBatchJobAndTaskState.NodeErrorDetails); - } - - return systemLogItems; - } - - if (azureBatchJobAndTaskState.ActiveJobWithMissingAutoPool) - { - logger.LogWarning("Found active job without auto pool for TES task {TesTask}. Deleting the job and requeuing the task. 
BatchJobInfo: {BatchJobInfo}", tesTask.Id, JsonConvert.SerializeObject(azureBatchJobAndTaskState)); - return new CombinedBatchTaskInfo - { - BatchTaskState = BatchTaskState.ActiveJobWithMissingAutoPool, - FailureReason = BatchTaskState.ActiveJobWithMissingAutoPool.ToString(), - Pool = azureBatchJobAndTaskState.Pool - }; - } - - if (azureBatchJobAndTaskState.MoreThanOneActiveJobOrTaskFound) - { - return new CombinedBatchTaskInfo - { - BatchTaskState = BatchTaskState.MoreThanOneActiveJobOrTaskFound, - FailureReason = BatchTaskState.MoreThanOneActiveJobOrTaskFound.ToString(), - Pool = azureBatchJobAndTaskState.Pool - }; - } + ///// + ///// Gets the current state of the Azure Batch task + ///// + ///// + ///// A for controlling the lifetime of the asynchronous operation. + ///// A higher-level abstraction of the current state of the Azure Batch task + //[System.Diagnostics.CodeAnalysis.SuppressMessage("Performance", "CA1826:Do not use Enumerable methods on indexable collections", Justification = "FirstOrDefault() is straightforward, the alternative is less clear.")] + //private async ValueTask GetBatchTaskStateAsync(TesTask tesTask, CloudTask cloudTask, CancellationToken cancellationToken) + //{ + // var azureBatchJobAndTaskState = await azureProxy.GetBatchJobAndTaskStateAsync(tesTask, cancellationToken); - // Because a ComputeTask is not assigned to the compute node while the StartTask is running, IAzureProxy.GetBatchJobAndTaskStateAsync() does not see start task failures. Deal with that here. - if (azureBatchJobAndTaskState.NodeState is null && azureBatchJobAndTaskState.JobState == JobState.Active && azureBatchJobAndTaskState.TaskState == TaskState.Active && !string.IsNullOrWhiteSpace(azureBatchJobAndTaskState.Pool?.PoolId)) - { - /* - * Priority order for assigning errors to TesTasks in shared-pool mode: - * 1. Node error found in GetBatchJobAndTaskStateAsync() - * 2. StartTask failure - * 3. 
NodeAllocation failure - */ - if (TryGetPool(azureBatchJobAndTaskState.Pool.PoolId, out var pool)) - { - if (!string.IsNullOrWhiteSpace(azureBatchJobAndTaskState.NodeErrorCode) || !ProcessStartTaskFailure(pool.PopNextStartTaskFailure())) - { - var resizeError = pool.PopNextResizeError(); - if (resizeError is not null) - { - azureBatchJobAndTaskState.NodeAllocationFailed = true; - azureBatchJobAndTaskState.NodeErrorCode = resizeError.Code; - azureBatchJobAndTaskState.NodeErrorDetails = Enumerable.Repeat(resizeError.Message, string.IsNullOrWhiteSpace(resizeError.Message) ? 1 : 0).Concat(resizeError.Values?.Select(d => d.Value) ?? Enumerable.Empty()); - } - } - } + // if (azureBatchJobAndTaskState.Pool?.PoolId is null) + // { + // azureBatchJobAndTaskState.Pool = tesTask.PoolId is null ? default : new() { PoolId = tesTask.PoolId }; + // } - bool ProcessStartTaskFailure(TaskFailureInformation failureInformation) - { - if (failureInformation is not null) - { - azureBatchJobAndTaskState.NodeState = ComputeNodeState.StartTaskFailed; - azureBatchJobAndTaskState.NodeErrorCode = failureInformation.Code; - azureBatchJobAndTaskState.NodeErrorDetails = failureInformation.Details?.Select(d => d.Value); - } + // static IEnumerable ConvertNodeErrorsToSystemLogItems(AzureBatchJobAndTaskState azureBatchJobAndTaskState) + // { + // var systemLogItems = new List(); - return failureInformation is not null; - } - } + // if (azureBatchJobAndTaskState.NodeErrorCode is not null) + // { + // systemLogItems.Add(azureBatchJobAndTaskState.NodeErrorCode); + // } - if (TaskFailureInformationCodes.DiskFull.Equals(azureBatchJobAndTaskState.NodeErrorCode, StringComparison.OrdinalIgnoreCase)) - { - azureBatchJobAndTaskState.NodeErrorDetails = (azureBatchJobAndTaskState.NodeErrorDetails ?? 
Enumerable.Empty()) - .Append($"Compute Node Error: {TaskFailureInformationCodes.DiskFull} Id: {azureBatchJobAndTaskState.NodeId}"); - } + // if (azureBatchJobAndTaskState.NodeErrorDetails is not null) + // { + // systemLogItems.AddRange(azureBatchJobAndTaskState.NodeErrorDetails); + // } - switch (azureBatchJobAndTaskState.JobState) - { - case null: - case JobState.Deleting: - return new CombinedBatchTaskInfo - { - BatchTaskState = BatchTaskState.JobNotFound, - FailureReason = BatchTaskState.JobNotFound.ToString(), - Pool = azureBatchJobAndTaskState.Pool - }; - case JobState.Active: - { - if (azureBatchJobAndTaskState.NodeAllocationFailed) - { - return new CombinedBatchTaskInfo - { - BatchTaskState = BatchTaskState.NodeAllocationFailed, - FailureReason = BatchTaskState.NodeAllocationFailed.ToString(), - SystemLogItems = ConvertNodeErrorsToSystemLogItems(azureBatchJobAndTaskState), - Pool = azureBatchJobAndTaskState.Pool - }; - } + // return systemLogItems; + // } - if (azureBatchJobAndTaskState.NodeState == ComputeNodeState.Unusable) - { - return new CombinedBatchTaskInfo - { - BatchTaskState = BatchTaskState.NodeUnusable, - FailureReason = BatchTaskState.NodeUnusable.ToString(), - SystemLogItems = ConvertNodeErrorsToSystemLogItems(azureBatchJobAndTaskState), - Pool = azureBatchJobAndTaskState.Pool - }; - } + // if (azureBatchJobAndTaskState.ActiveJobWithMissingAutoPool) + // { + // logger.LogWarning("Found active job without auto pool for TES task {TesTask}. Deleting the job and requeuing the task. 
BatchJobInfo: {BatchJobInfo}", tesTask.Id, JsonConvert.SerializeObject(azureBatchJobAndTaskState)); + // return new CombinedBatchTaskInfo + // { + // BatchTaskState = BatchTaskState.ActiveJobWithMissingAutoPool, + // FailureReason = BatchTaskState.ActiveJobWithMissingAutoPool.ToString(), + // Pool = azureBatchJobAndTaskState.Pool + // }; + // } - if (azureBatchJobAndTaskState.NodeState == ComputeNodeState.Preempted) - { - return new CombinedBatchTaskInfo - { - BatchTaskState = BatchTaskState.NodePreempted, - FailureReason = BatchTaskState.NodePreempted.ToString(), - SystemLogItems = ConvertNodeErrorsToSystemLogItems(azureBatchJobAndTaskState), - Pool = azureBatchJobAndTaskState.Pool - }; - } + // if (azureBatchJobAndTaskState.MoreThanOneActiveJobOrTaskFound) + // { + // return new CombinedBatchTaskInfo + // { + // BatchTaskState = BatchTaskState.MoreThanOneActiveJobOrTaskFound, + // FailureReason = BatchTaskState.MoreThanOneActiveJobOrTaskFound.ToString(), + // Pool = azureBatchJobAndTaskState.Pool + // }; + // } - if (azureBatchJobAndTaskState.NodeErrorCode is not null) - { - if (azureBatchJobAndTaskState.NodeErrorCode == TaskFailureInformationCodes.DiskFull) - { - return new CombinedBatchTaskInfo - { - BatchTaskState = BatchTaskState.NodeFailedDuringStartupOrExecution, - FailureReason = azureBatchJobAndTaskState.NodeErrorCode, - SystemLogItems = ConvertNodeErrorsToSystemLogItems(azureBatchJobAndTaskState), - Pool = azureBatchJobAndTaskState.Pool - }; - } - else - { - return new CombinedBatchTaskInfo - { - BatchTaskState = BatchTaskState.NodeFailedDuringStartupOrExecution, - FailureReason = BatchTaskState.NodeFailedDuringStartupOrExecution.ToString(), - SystemLogItems = ConvertNodeErrorsToSystemLogItems(azureBatchJobAndTaskState), - Pool = azureBatchJobAndTaskState.Pool - }; - } - } + // // Because a ComputeTask is not assigned to the compute node while the StartTask is running, IAzureProxy.GetBatchJobAndTaskStateAsync() does not see start task failures. 
Deal with that here. + // if (azureBatchJobAndTaskState.NodeState is null && azureBatchJobAndTaskState.JobState == JobState.Active && azureBatchJobAndTaskState.TaskState == TaskState.Active && !string.IsNullOrWhiteSpace(azureBatchJobAndTaskState.Pool?.PoolId)) + // { + // /* + // * Priority order for assigning errors to TesTasks in shared-pool mode: + // * 1. Node error found in GetBatchJobAndTaskStateAsync() + // * 2. StartTask failure + // * 3. NodeAllocation failure + // */ + // if (TryGetPool(azureBatchJobAndTaskState.Pool.PoolId, out var pool)) + // { + // if (!string.IsNullOrWhiteSpace(azureBatchJobAndTaskState.NodeErrorCode) || !ProcessStartTaskFailure(pool.PopNextStartTaskFailure())) + // { + // var resizeError = pool.PopNextResizeError(); + // if (resizeError is not null) + // { + // azureBatchJobAndTaskState.NodeAllocationFailed = true; + // azureBatchJobAndTaskState.NodeErrorCode = resizeError.Code; + // azureBatchJobAndTaskState.NodeErrorDetails = Enumerable.Repeat(resizeError.Message, string.IsNullOrWhiteSpace(resizeError.Message) ? 1 : 0).Concat(resizeError.Values?.Select(d => d.Value) ?? 
Enumerable.Empty()); + // } + // } + // } - break; - } - case JobState.Terminating: - case JobState.Completed: - break; - default: - throw new Exception($"Found batch job {tesTask.Id} in unexpected state: {azureBatchJobAndTaskState.JobState}"); - } + // bool ProcessStartTaskFailure(TaskFailureInformation failureInformation) + // { + // if (failureInformation is not null) + // { + // azureBatchJobAndTaskState.NodeState = ComputeNodeState.StartTaskFailed; + // azureBatchJobAndTaskState.NodeErrorCode = failureInformation.Code; + // azureBatchJobAndTaskState.NodeErrorDetails = failureInformation.Details?.Select(d => d.Value); + // } + + // return failureInformation is not null; + // } + // } - switch (azureBatchJobAndTaskState.TaskState) - { - case null: - return new CombinedBatchTaskInfo - { - BatchTaskState = BatchTaskState.MissingBatchTask, - FailureReason = BatchTaskState.MissingBatchTask.ToString(), - Pool = azureBatchJobAndTaskState.Pool - }; - case TaskState.Active: - case TaskState.Preparing: - return new CombinedBatchTaskInfo - { - BatchTaskState = BatchTaskState.Initializing, - Pool = azureBatchJobAndTaskState.Pool - }; - case TaskState.Running: - return new CombinedBatchTaskInfo - { - BatchTaskState = BatchTaskState.Running, - Pool = azureBatchJobAndTaskState.Pool - }; - case TaskState.Completed: - if (azureBatchJobAndTaskState.TaskExitCode == 0 && azureBatchJobAndTaskState.TaskFailureInformation is null) - { - var metrics = await GetBatchNodeMetricsAndCromwellResultCodeAsync(tesTask, cancellationToken); + // if (TaskFailureInformationCodes.DiskFull.Equals(azureBatchJobAndTaskState.NodeErrorCode, StringComparison.OrdinalIgnoreCase)) + // { + // azureBatchJobAndTaskState.NodeErrorDetails = (azureBatchJobAndTaskState.NodeErrorDetails ?? 
Enumerable.Empty()) + // .Append($"Compute Node Error: {TaskFailureInformationCodes.DiskFull} Id: {azureBatchJobAndTaskState.NodeId}"); + // } - return new CombinedBatchTaskInfo - { - BatchTaskState = BatchTaskState.CompletedSuccessfully, - BatchTaskExitCode = azureBatchJobAndTaskState.TaskExitCode, - BatchTaskStartTime = metrics.TaskStartTime ?? azureBatchJobAndTaskState.TaskStartTime, - BatchTaskEndTime = metrics.TaskEndTime ?? azureBatchJobAndTaskState.TaskEndTime, - BatchNodeMetrics = metrics.BatchNodeMetrics, - CromwellRcCode = metrics.CromwellRcCode, - Pool = azureBatchJobAndTaskState.Pool - }; - } - else - { - logger.LogError("Task {TesTask} failed. ExitCode: {TaskExitCode}, BatchJobInfo: {BatchJobInfo}", tesTask.Id, azureBatchJobAndTaskState.TaskExitCode, JsonConvert.SerializeObject(azureBatchJobAndTaskState)); + // switch (azureBatchJobAndTaskState.JobState) + // { + // case null: + // case JobState.Deleting: + // return new CombinedBatchTaskInfo + // { + // BatchTaskState = BatchTaskState.JobNotFound, + // FailureReason = BatchTaskState.JobNotFound.ToString(), + // Pool = azureBatchJobAndTaskState.Pool + // }; + // case JobState.Active: + // { + // if (azureBatchJobAndTaskState.NodeAllocationFailed) + // { + // return new CombinedBatchTaskInfo + // { + // BatchTaskState = BatchTaskState.NodeAllocationFailed, + // FailureReason = BatchTaskState.NodeAllocationFailed.ToString(), + // SystemLogItems = ConvertNodeErrorsToSystemLogItems(azureBatchJobAndTaskState), + // Pool = azureBatchJobAndTaskState.Pool + // }; + // } + + // if (azureBatchJobAndTaskState.NodeState == ComputeNodeState.Unusable) + // { + // return new CombinedBatchTaskInfo + // { + // BatchTaskState = BatchTaskState.NodeUnusable, + // FailureReason = BatchTaskState.NodeUnusable.ToString(), + // SystemLogItems = ConvertNodeErrorsToSystemLogItems(azureBatchJobAndTaskState), + // Pool = azureBatchJobAndTaskState.Pool + // }; + // } + + // if (azureBatchJobAndTaskState.NodeState == 
ComputeNodeState.Preempted) + // { + // return new CombinedBatchTaskInfo + // { + // BatchTaskState = BatchTaskState.NodePreempted, + // FailureReason = BatchTaskState.NodePreempted.ToString(), + // SystemLogItems = ConvertNodeErrorsToSystemLogItems(azureBatchJobAndTaskState), + // Pool = azureBatchJobAndTaskState.Pool + // }; + // } + + // if (azureBatchJobAndTaskState.NodeErrorCode is not null) + // { + // if (azureBatchJobAndTaskState.NodeErrorCode == TaskFailureInformationCodes.DiskFull) + // { + // return new CombinedBatchTaskInfo + // { + // BatchTaskState = BatchTaskState.NodeFailedDuringStartupOrExecution, + // FailureReason = azureBatchJobAndTaskState.NodeErrorCode, + // SystemLogItems = ConvertNodeErrorsToSystemLogItems(azureBatchJobAndTaskState), + // Pool = azureBatchJobAndTaskState.Pool + // }; + // } + // else + // { + // return new CombinedBatchTaskInfo + // { + // BatchTaskState = BatchTaskState.NodeFailedDuringStartupOrExecution, + // FailureReason = BatchTaskState.NodeFailedDuringStartupOrExecution.ToString(), + // SystemLogItems = ConvertNodeErrorsToSystemLogItems(azureBatchJobAndTaskState), + // Pool = azureBatchJobAndTaskState.Pool + // }; + // } + // } + + // break; + // } + // case JobState.Terminating: + // case JobState.Completed: + // break; + // default: + // throw new Exception($"Found batch job {tesTask.Id} in unexpected state: {azureBatchJobAndTaskState.JobState}"); + // } - return new CombinedBatchTaskInfo - { - BatchTaskState = BatchTaskState.CompletedWithErrors, - FailureReason = azureBatchJobAndTaskState.TaskFailureInformation?.Code, - BatchTaskExitCode = azureBatchJobAndTaskState.TaskExitCode, - BatchTaskStartTime = azureBatchJobAndTaskState.TaskStartTime, - BatchTaskEndTime = azureBatchJobAndTaskState.TaskEndTime, - SystemLogItems = Enumerable.Empty() - .Append($"Batch task ExitCode: {azureBatchJobAndTaskState.TaskExitCode}, Failure message: {azureBatchJobAndTaskState.TaskFailureInformation?.Message}") - 
.Concat(azureBatchJobAndTaskState.TaskFailureInformation?.Details?.Select(d => $"{d.Name}: {d.Value}") ?? Enumerable.Empty()), - Pool = azureBatchJobAndTaskState.Pool - }; - } - default: - throw new Exception($"Found batch task {tesTask.Id} in unexpected state: {azureBatchJobAndTaskState.TaskState}"); - } - } + // switch (azureBatchJobAndTaskState.TaskState) + // { + // case null: + // return new CombinedBatchTaskInfo + // { + // BatchTaskState = BatchTaskState.MissingBatchTask, + // FailureReason = BatchTaskState.MissingBatchTask.ToString(), + // Pool = azureBatchJobAndTaskState.Pool + // }; + // case TaskState.Active: + // case TaskState.Preparing: + // return new CombinedBatchTaskInfo + // { + // BatchTaskState = BatchTaskState.Initializing, + // Pool = azureBatchJobAndTaskState.Pool + // }; + // case TaskState.Running: + // return new CombinedBatchTaskInfo + // { + // BatchTaskState = BatchTaskState.Running, + // Pool = azureBatchJobAndTaskState.Pool + // }; + // case TaskState.Completed: + // if (azureBatchJobAndTaskState.TaskExitCode == 0 && azureBatchJobAndTaskState.TaskFailureInformation is null) + // { + // var metrics = await GetBatchNodeMetricsAndCromwellResultCodeAsync(tesTask, cancellationToken); + + // return new CombinedBatchTaskInfo + // { + // BatchTaskState = BatchTaskState.CompletedSuccessfully, + // BatchTaskExitCode = azureBatchJobAndTaskState.TaskExitCode, + // BatchTaskStartTime = metrics.TaskStartTime ?? azureBatchJobAndTaskState.TaskStartTime, + // BatchTaskEndTime = metrics.TaskEndTime ?? azureBatchJobAndTaskState.TaskEndTime, + // BatchNodeMetrics = metrics.BatchNodeMetrics, + // CromwellRcCode = metrics.CromwellRcCode, + // Pool = azureBatchJobAndTaskState.Pool + // }; + // } + // else + // { + // logger.LogError("Task {TesTask} failed. 
ExitCode: {TaskExitCode}, BatchJobInfo: {BatchJobInfo}", tesTask.Id, azureBatchJobAndTaskState.TaskExitCode, JsonConvert.SerializeObject(azureBatchJobAndTaskState)); + + // return new CombinedBatchTaskInfo + // { + // BatchTaskState = BatchTaskState.CompletedWithErrors, + // FailureReason = azureBatchJobAndTaskState.TaskFailureInformation?.Code, + // BatchTaskExitCode = azureBatchJobAndTaskState.TaskExitCode, + // BatchTaskStartTime = azureBatchJobAndTaskState.TaskStartTime, + // BatchTaskEndTime = azureBatchJobAndTaskState.TaskEndTime, + // SystemLogItems = Enumerable.Empty() + // .Append($"Batch task ExitCode: {azureBatchJobAndTaskState.TaskExitCode}, Failure message: {azureBatchJobAndTaskState.TaskFailureInformation?.Message}") + // .Concat(azureBatchJobAndTaskState.TaskFailureInformation?.Details?.Select(d => $"{d.Name}: {d.Value}") ?? Enumerable.Empty()), + // Pool = azureBatchJobAndTaskState.Pool + // }; + // } + // default: + // throw new Exception($"Found batch task {tesTask.Id} in unexpected state: {azureBatchJobAndTaskState.TaskState}"); + // } + //} ///// ///// Transitions the to the new state, based on the rules defined in the tesTaskStateTransitions list. 
diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index 3f804ad4a..8348b45fb 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -137,9 +137,6 @@ public async Task DeleteBatchComputeNodesAsync(string poolId, IEnumerable public int GetBatchActivePoolCount() => cachingRetryHandler.ExecuteWithRetry(() => azureProxy.GetBatchActivePoolCount()); - /// - public Task GetBatchJobAndTaskStateAsync(Tes.Models.TesTask tesTask, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBatchJobAndTaskStateAsync(tesTask, ct), cancellationToken); - /// public Task> GetPoolIdsReferencedByJobsAsync(CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(azureProxy.GetPoolIdsReferencedByJobsAsync, cancellationToken); diff --git a/src/TesApi.Web/IAzureProxy.cs b/src/TesApi.Web/IAzureProxy.cs index f30e30a3c..543b2183e 100644 --- a/src/TesApi.Web/IAzureProxy.cs +++ b/src/TesApi.Web/IAzureProxy.cs @@ -57,13 +57,7 @@ public interface IAzureProxy /// A for controlling the lifetime of the asynchronous operation. Task CreateBatchPoolAsync(BatchModels.Pool poolInfo, bool isPreemptable, CancellationToken cancellationToken); - /// - /// Gets the combined state of Azure Batch job, task and pool that corresponds to the given TES task - /// - /// The TES task - /// A for controlling the lifetime of the asynchronous operation. - /// Job state information - Task GetBatchJobAndTaskStateAsync(TesTask tesTask, CancellationToken cancellationToken); + // TODO: we may need a query or two to replace what was here /// /// Deletes an Azure Batch job for Autopools diff --git a/src/TesApi.Web/IBatchPool.cs b/src/TesApi.Web/IBatchPool.cs index 97f56b475..6fb4b6ab5 100644 --- a/src/TesApi.Web/IBatchPool.cs +++ b/src/TesApi.Web/IBatchPool.cs @@ -2,6 +2,7 @@ // Licensed under the MIT License. 
using System; +using System.Collections.Generic; using System.Threading; using System.Threading.Tasks; using Microsoft.Azure.Batch; @@ -70,5 +71,12 @@ public interface IBatchPool /// /// ValueTask GetAllocationStateTransitionTime(CancellationToken cancellationToken = default); + + /// + /// Gets the completed tasks in this pool's associated job. + /// + /// + /// + IAsyncEnumerable GetCompletedTasks(CancellationToken cancellationToken); } } diff --git a/src/TesApi.Web/IBatchScheduler.cs b/src/TesApi.Web/IBatchScheduler.cs index 7c47f116b..2641d9190 100644 --- a/src/TesApi.Web/IBatchScheduler.cs +++ b/src/TesApi.Web/IBatchScheduler.cs @@ -47,10 +47,10 @@ public interface IBatchScheduler /// Finalize completed s on a batch system /// /// s to schedule on the batch system. - /// s corresponding to each . + /// s corresponding to each . /// A for controlling the lifetime of the asynchronous operation. /// True for each corresponding that needs to be persisted. - IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessCompletedTesTasksAsync(IEnumerable tesTasks, CloudTask[] cloudTask, CancellationToken cancellationToken); + IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessCompletedTesTasksAsync(IEnumerable tesTasks, CloudTask[] cloudTasks, CancellationToken cancellationToken); /// /// Terminate cancelled s on a batch system diff --git a/src/TesApi.Web/OrchestrateOnBatchSchedulerService.cs b/src/TesApi.Web/OrchestrateOnBatchSchedulerService.cs new file mode 100644 index 000000000..674c5ea44 --- /dev/null +++ b/src/TesApi.Web/OrchestrateOnBatchSchedulerService.cs @@ -0,0 +1,272 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.Hosting; +using Microsoft.Extensions.Logging; +using Npgsql.Replication.PgOutput.Messages; +using Tes.Extensions; +using Tes.Models; +using Tes.Repository; + +namespace TesApi.Web +{ + /// + /// A background service template that schedules TES tasks in the batch system, orchestrates their lifecycle, and updates their state. + /// This should only be used to build system-wide singleton services. This class does not support scale-out on multiple machines, + /// nor does it implement a leasing mechanism. In the future, consider using the Lease Blob operation. + /// + internal abstract class OrchestrateOnBatchSchedulerService : BackgroundService + { + protected readonly IRepository repository; + protected readonly IBatchScheduler batchScheduler; + protected readonly ILogger logger; + + /// + /// Default constructor + /// + /// The main TES task database repository implementation + /// The batch scheduler implementation + /// The logger instance + protected OrchestrateOnBatchSchedulerService(IRepository repository, IBatchScheduler batchScheduler, ILogger logger) + { + this.repository = repository; + this.batchScheduler = batchScheduler; + this.logger = logger; + } + + /// + /// Prepends the log message with the ultimately derived class's name. + /// + /// + /// prepended with the class name. 
+ protected string MarkLogMessage(string message) + { + return GetType().Name + " " + message; + } + + /// + [System.Diagnostics.CodeAnalysis.SuppressMessage("Usage", "CA2254:Template should be a static expression", Justification = "Used to provide service's name in log message.")] + public override Task StopAsync(CancellationToken cancellationToken) + { + logger.LogInformation(MarkLogMessage("stopping...")); + return base.StopAsync(cancellationToken); + } + + /// + [System.Diagnostics.CodeAnalysis.SuppressMessage("Usage", "CA2254:Template should be a static expression", Justification = "Used to provide service's name in log message.")] + protected sealed override async Task ExecuteAsync(CancellationToken stoppingToken) + { + // The order of these two calls is critical. + ExecuteSetup(stoppingToken); + await ExecuteSetupAsync(stoppingToken); + + logger.LogInformation(MarkLogMessage("started.")); + + await ExecuteCoreAsync(stoppingToken); + + logger.LogInformation(MarkLogMessage("gracefully stopped.")); + } + + /// + /// This method is called when the starts. The implementation should return a task that represents + /// the lifetime of the long running operation(s) being performed. + /// + /// Triggered when is called. + /// A that represents the long running operations. + /// See Worker Services in .NET for implementation guidelines. + protected abstract Task ExecuteCoreAsync(CancellationToken stoppingToken); + + /// + /// This method is called right before . It can be used to prepare the service or the system before the service's operations begin. + /// + /// Triggered when is called. + /// A that represents this method's operations. + protected virtual Task ExecuteSetupAsync(CancellationToken stoppingToken) => Task.CompletedTask; + + /// + /// This method is called right before . It can be used to prepare the service or the system before the service's operations begin. + /// + /// Triggered when is called. 
+ /// This method's lifetime will delay the exit of in the base class, thus delaying the start of subsequent services in the system. + protected virtual void ExecuteSetup(CancellationToken stoppingToken) { } + + /// + /// Runs repeatedly at an interval of . + /// + /// Delay between runs of . + /// Action to repeatedly run. + /// Triggered when is called. + /// A System.Threading.Tasks.Task that represents the long running operations. + protected async Task ExecuteActionOnIntervalAsync(TimeSpan runInterval, Func action, CancellationToken stoppingToken) + { + ArgumentNullException.ThrowIfNull(action); + + while (!stoppingToken.IsCancellationRequested) + { + try + { + await action(stoppingToken); + await Task.Delay(runInterval, stoppingToken); + } + catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested) + { + break; + } + catch (Exception exc) + { + logger.LogError(exc, "{Message}", exc.Message); + } + } + } + + /// + /// Retrieves provided actionable TES tasks from the database using , performs an action in the batch system using , and updates the resultant state + /// + /// A System.Threading.Tasks.ValueTask that represents the long running operations. + protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func>> tesTaskGetter, Func IsModified)>> tesTaskProcessor, CancellationToken stoppingToken) + { + var pools = new HashSet(); + var tesTasks = await (await tesTaskGetter(stoppingToken)).ToArrayAsync(stoppingToken); + + if (0 == tesTasks.Length) + { + return; + } + + var startTime = DateTime.UtcNow; + + await foreach (var (tesTask, waitableResult) in tesTaskProcessor(tesTasks, stoppingToken).WithCancellation(stoppingToken)) + { + try + { + var isModified = false; + try + { + isModified = await waitableResult; + } + catch (Exception exc) + { + if (++tesTask.ErrorCount > 3) // TODO: Should we increment this for exceptions here (current behaviour) or the attempted executions on the batch? 
+ { + tesTask.State = TesState.SYSTEMERROREnum; + tesTask.EndTime = DateTimeOffset.UtcNow; + tesTask.SetFailureReason("UnknownError", exc.Message, exc.StackTrace); + } + + if (exc is Microsoft.Azure.Batch.Common.BatchException batchException) + { + var requestInfo = batchException.RequestInformation; + //var requestId = batchException.RequestInformation?.ServiceRequestId; + var reason = (batchException.InnerException as Microsoft.Azure.Batch.Protocol.Models.BatchErrorException)?.Response?.ReasonPhrase; + var logs = new List(); + + if (requestInfo?.ServiceRequestId is not null) + { + logs.Add($"Azure batch ServiceRequestId: {requestInfo.ServiceRequestId}"); + } + + if (requestInfo?.BatchError is not null) + { + logs.Add($"BatchErrorCode: {requestInfo.BatchError.Code}"); + logs.Add($"BatchErrorMessage: {requestInfo.BatchError.Message}"); + foreach (var detail in requestInfo.BatchError.Values?.Select(d => $"{d.Key}={d.Value}") ?? Enumerable.Empty()) + { + logs.Add(detail); + } + } + + tesTask.AddToSystemLog(logs); + } + + logger.LogError(exc, "TES task: {TesTask} threw an exception in OrchestrateTesTasksOnBatch({Poll}).", tesTask.Id, pollName); + await repository.UpdateItemAsync(tesTask, stoppingToken); + } + + if (isModified) + { + var hasErrored = false; + var hasEnded = false; + + switch (tesTask.State) + { + case TesState.CANCELEDEnum: + case TesState.COMPLETEEnum: + hasEnded = true; + break; + + case TesState.EXECUTORERROREnum: + case TesState.SYSTEMERROREnum: + hasErrored = true; + hasEnded = true; + break; + + default: + break; + } + + if (hasEnded) + { + tesTask.EndTime = DateTimeOffset.UtcNow; + } + + if (hasErrored) + { + logger.LogDebug("{TesTask} failed, state: {TesTaskState}, reason: {TesTaskFailureReason}", tesTask.Id, tesTask.State, tesTask.FailureReason); + } + + await repository.UpdateItemAsync(tesTask, stoppingToken); + } + } + catch (RepositoryCollisionException exc) + { + logger.LogError(exc, "RepositoryCollisionException in 
OrchestrateTesTasksOnBatch({Poll})", pollName); + //TODO: retrieve fresh task if possible and add logs to the task in a similar way to the commanted out code block below. + //Also: consider doing the same in the other place(s) this exception is caught. + } + // TODO catch EF / postgres exception? + //catch (Microsoft.Azure.Cosmos.CosmosException exc) + //{ + // TesTask currentTesTask = default; + // _ = await repository.TryGetItemAsync(tesTask.Id, t => currentTesTask = t); + + // if (exc.StatusCode == System.Net.HttpStatusCode.PreconditionFailed) + // { + // logger.LogError(exc, $"Updating TES Task '{tesTask.Id}' threw an exception attempting to set state: {tesTask.State}. Another actor set state: {currentTesTask?.State}"); + // currentTesTask?.SetWarning("ConcurrencyWriteFailure", tesTask.State.ToString(), exc.Message, exc.StackTrace); + // } + // else + // { + // logger.LogError(exc, $"Updating TES Task '{tesTask.Id}' threw {exc.GetType().FullName}: '{exc.Message}'. Stack trace: {exc.StackTrace}"); + // currentTesTask?.SetWarning("UnknownError", exc.Message, exc.StackTrace); + // } + + // if (currentTesTask is not null) + // { + // await repository.UpdateItemAsync(currentTesTask); + // } + //} + catch (Exception exc) + { + logger.LogError(exc, "Updating TES Task '{TesTask}' threw {ExceptionType}: '{ExceptionMessage}'. 
Stack trace: {ExceptionStackTrace}", tesTask.Id, exc.GetType().FullName, exc.Message, exc.StackTrace); + } + + if (!string.IsNullOrWhiteSpace(tesTask.PoolId) && (TesState.QUEUEDEnum == tesTask.State || TesState.RUNNINGEnum == tesTask.State)) + { + pools.Add(tesTask.PoolId); + } + } + + if (batchScheduler.NeedPoolFlush) + { + await batchScheduler.FlushPoolsAsync(pools, stoppingToken); + } + + logger.LogDebug("OrchestrateTesTasksOnBatch({Poll}) for {TaskCount} tasks completed in {TotalSeconds} seconds.", pollName, tesTasks.Length, DateTime.UtcNow.Subtract(startTime).TotalSeconds); + } + } +} diff --git a/src/TesApi.Web/Scheduler.cs b/src/TesApi.Web/Scheduler.cs index 679908670..f1cac5709 100644 --- a/src/TesApi.Web/Scheduler.cs +++ b/src/TesApi.Web/Scheduler.cs @@ -6,11 +6,10 @@ using System.Linq; using System.Threading; using System.Threading.Tasks; -using Microsoft.Extensions.Hosting; using Microsoft.Extensions.Logging; -using Tes.Extensions; using Tes.Models; using Tes.Repository; +using YamlDotNet.Core.Tokens; namespace TesApi.Web { @@ -19,12 +18,9 @@ namespace TesApi.Web /// This should only be used as a system-wide singleton service. This class does not support scale-out on multiple machines, /// nor does it implement a leasing mechanism. In the future, consider using the Lease Blob operation. 
/// - public class Scheduler : BackgroundService + internal class Scheduler : OrchestrateOnBatchSchedulerService { - private readonly IRepository repository; - private readonly IBatchScheduler batchScheduler; - private readonly ILogger logger; - private readonly TimeSpan runInterval = TimeSpan.FromSeconds(5); + private readonly TimeSpan runInterval = TimeSpan.FromSeconds(30); // The very fastest process inside of Azure Batch accessing anything within pools or jobs uses a 30 second polling interval /// /// Default constructor @@ -33,25 +29,15 @@ public class Scheduler : BackgroundService /// The batch scheduler implementation /// The logger instance public Scheduler(IRepository repository, IBatchScheduler batchScheduler, ILogger logger) - { - this.repository = repository; - this.batchScheduler = batchScheduler; - this.logger = logger; - } + : base(repository, batchScheduler, logger) { } - /// - public override Task StopAsync(CancellationToken cancellationToken) - { - logger.LogInformation("Scheduler stopping..."); - return base.StopAsync(cancellationToken); - } /// /// The main thread that continuously schedules TES tasks in the batch system /// /// Triggered when Microsoft.Extensions.Hosting.IHostedService.StopAsync(System.Threading.CancellationToken) is called. /// A System.Threading.Tasks.Task that represents the long running operations. 
- protected override async Task ExecuteAsync(CancellationToken stoppingToken) + protected override async Task ExecuteSetupAsync(CancellationToken stoppingToken) { try { @@ -63,28 +49,33 @@ protected override async Task ExecuteAsync(CancellationToken stoppingToken) logger.LogError(exc, @"Checking/storing the node task runner binary failed with {Message}", exc.Message); throw; } + } - logger.LogInformation("Scheduler started."); - - await Task.WhenAll(ExecuteCancelledTesTasksOnBatch(stoppingToken), ExecuteQueuedTesTasksOnBatch(stoppingToken), ExecuteTerminatedTesTasksOnBatch(stoppingToken)); - - logger.LogInformation("Scheduler gracefully stopped."); + /// + /// The main thread that continuously schedules TES tasks in the batch system + /// + /// Triggered when Microsoft.Extensions.Hosting.IHostedService.StopAsync(System.Threading.CancellationToken) is called. + /// A System.Threading.Tasks.Task that represents the long running operations. + protected override Task ExecuteCoreAsync(CancellationToken stoppingToken) + { + return Task.WhenAll(ExecuteCancelledTesTasksOnBatchAsync(stoppingToken), ExecuteQueuedTesTasksOnBatchAsync(stoppingToken), ExecuteTerminatedTesTasksOnBatchAsync(stoppingToken)); } /// /// Retrieves all queued TES tasks from the database, performs an action in the batch system, and updates the resultant state /// /// - private async Task ExecuteQueuedTesTasksOnBatch(CancellationToken stoppingToken) + private Task ExecuteQueuedTesTasksOnBatchAsync(CancellationToken stoppingToken) { - await ExecuteTesTasksOnBatch( - async () => (await repository.GetItemsAsync( + var query = new Func>>( + async cancellationToken => (await repository.GetItemsAsync( predicate: t => t.State == TesState.QUEUEDEnum, - cancellationToken: stoppingToken)) - .OrderBy(t => t.CreationTime) - .ToAsyncEnumerable(), + cancellationToken: cancellationToken)) + .OrderBy(t => t.CreationTime) + .ToAsyncEnumerable()); - tasks => batchScheduler.ProcessQueuedTesTasksAsync(tasks, 
stoppingToken), + return ExecuteActionOnIntervalAsync(runInterval, + cancellationToken => OrchestrateTesTasksOnBatchAsync("Queued", query, batchScheduler.ProcessQueuedTesTasksAsync, cancellationToken), stoppingToken); } @@ -92,16 +83,17 @@ await ExecuteTesTasksOnBatch( /// Retrieves all cancelled TES tasks from the database, performs an action in the batch system, and updates the resultant state /// /// - private async Task ExecuteCancelledTesTasksOnBatch(CancellationToken stoppingToken) + private Task ExecuteCancelledTesTasksOnBatchAsync(CancellationToken stoppingToken) { - await ExecuteTesTasksOnBatch( - async () => (await repository.GetItemsAsync( + var query = new Func>>( + async cancellationToken => (await repository.GetItemsAsync( predicate: t => t.State == TesState.CANCELINGEnum, - cancellationToken: stoppingToken)) - .OrderBy(t => t.CreationTime) - .ToAsyncEnumerable(), + cancellationToken: cancellationToken)) + .OrderBy(t => t.CreationTime) + .ToAsyncEnumerable()); - tasks => batchScheduler.ProcessCancelledTesTasksAsync(tasks, stoppingToken), + return ExecuteActionOnIntervalAsync(runInterval, + cancellationToken => OrchestrateTesTasksOnBatchAsync("Cancelled", query, batchScheduler.ProcessCancelledTesTasksAsync, cancellationToken), stoppingToken); } @@ -109,194 +101,18 @@ await ExecuteTesTasksOnBatch( /// Retrieves all terminated TES tasks from the database, performs an action in the batch system, and updates the resultant state /// /// - private async Task ExecuteTerminatedTesTasksOnBatch(CancellationToken stoppingToken) + private Task ExecuteTerminatedTesTasksOnBatchAsync(CancellationToken stoppingToken) { - await ExecuteTesTasksOnBatch( - async () => (await repository.GetItemsAsync( + var query = new Func>>( + async cancellationToken => (await repository.GetItemsAsync( predicate: t => t.IsTaskDeletionRequired, - cancellationToken: stoppingToken)) - .OrderBy(t => t.CreationTime) - .ToAsyncEnumerable(), + cancellationToken: cancellationToken)) + .OrderBy(t 
=> t.CreationTime) + .ToAsyncEnumerable()); - tasks => batchScheduler.ProcessTerminatedTesTasksAsync(tasks, stoppingToken), + return ExecuteActionOnIntervalAsync(runInterval, + cancellationToken => OrchestrateTesTasksOnBatchAsync("Terminated", query, batchScheduler.ProcessTerminatedTesTasksAsync, cancellationToken), stoppingToken); } - - /// - /// Retrieves provided actionable TES tasks from the database, performs an action in the batch system, and updates the resultant state - /// - /// - private async ValueTask ExecuteTesTasksOnBatch(Func>> tesTaskGetter, Func IsModified)>> tesTaskProcessor, CancellationToken stoppingToken) - { - while (!stoppingToken.IsCancellationRequested) - { - try - { - await OrchestrateTesTasksOnBatch(tesTaskGetter, tesTaskProcessor, stoppingToken); - } - catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested) - { - break; - } - catch (Exception exc) - { - logger.LogError(exc, exc.Message); - } - - try - { - await Task.Delay(runInterval, stoppingToken); - } - catch (TaskCanceledException) - { - break; - } - } - } - - /// - /// Retrieves provided actionable TES tasks from the database, performs an action in the batch system, and updates the resultant state - /// - /// - private async ValueTask OrchestrateTesTasksOnBatch(Func>> tesTaskGetter, Func IsModified)>> tesTaskProcessor, CancellationToken stoppingToken) - { - var pools = new HashSet(); - var tesTasks = await (await tesTaskGetter()).ToArrayAsync(stoppingToken); - - if (0 == tesTasks.Length) - { - return; - } - - var startTime = DateTime.UtcNow; - - await foreach (var (tesTask, waitableResult) in tesTaskProcessor(tesTasks).WithCancellation(stoppingToken)) - { - try - { - var isModified = false; - try - { - isModified = await waitableResult; - } - catch (Exception exc) - { - if (++tesTask.ErrorCount > 3) // TODO: Should we increment this for exceptions here (current behaviour) or the attempted executions on the batch? 
- { - tesTask.State = TesState.SYSTEMERROREnum; - tesTask.EndTime = DateTimeOffset.UtcNow; - tesTask.SetFailureReason("UnknownError", exc.Message, exc.StackTrace); - } - - if (exc is Microsoft.Azure.Batch.Common.BatchException batchException) - { - var requestInfo = batchException.RequestInformation; - //var requestId = batchException.RequestInformation?.ServiceRequestId; - var reason = (batchException.InnerException as Microsoft.Azure.Batch.Protocol.Models.BatchErrorException)?.Response?.ReasonPhrase; - var logs = new List(); - - if (requestInfo?.ServiceRequestId is not null) - { - logs.Add($"Azure batch ServiceRequestId: {requestInfo.ServiceRequestId}"); - } - - if (requestInfo?.BatchError is not null) - { - logs.Add($"BatchErrorCode: {requestInfo.BatchError.Code}"); - logs.Add($"BatchErrorMessage: {requestInfo.BatchError.Message}"); - foreach (var detail in requestInfo.BatchError.Values?.Select(d => $"{d.Key}={d.Value}") ?? Enumerable.Empty()) - { - logs.Add(detail); - } - } - - tesTask.AddToSystemLog(logs); - } - - logger.LogError(exc, "TES task: {TesTask} threw an exception in OrchestrateTesTasksOnBatch().", tesTask.Id); - await repository.UpdateItemAsync(tesTask, stoppingToken); - } - - if (isModified) - { - var hasErrored = false; - var hasEnded = false; - - switch (tesTask.State) - { - case TesState.CANCELEDEnum: - case TesState.COMPLETEEnum: - hasEnded = true; - break; - - case TesState.EXECUTORERROREnum: - case TesState.SYSTEMERROREnum: - hasErrored = true; - hasEnded = true; - break; - - default: - break; - } - - if (hasEnded) - { - tesTask.EndTime = DateTimeOffset.UtcNow; - } - - if (hasErrored) - { - logger.LogDebug("{TesTask} failed, state: {TesTaskState}, reason: {TesTaskFailureReason}", tesTask.Id, tesTask.State, tesTask.FailureReason); - } - - await repository.UpdateItemAsync(tesTask, stoppingToken); - } - } - catch (RepositoryCollisionException exc) - { - logger.LogError(exc, $"RepositoryCollisionException in OrchestrateTesTasksOnBatch"); - 
//TODO: retrieve fresh task if possible and add logs to the task in a similar way to the commanted out code block below. - //Also: consider doing the same in the other place(s) this exception is caught. - } - // TODO catch EF / postgres exception? - //catch (Microsoft.Azure.Cosmos.CosmosException exc) - //{ - // TesTask currentTesTask = default; - // _ = await repository.TryGetItemAsync(tesTask.Id, t => currentTesTask = t); - - // if (exc.StatusCode == System.Net.HttpStatusCode.PreconditionFailed) - // { - // logger.LogError(exc, $"Updating TES Task '{tesTask.Id}' threw an exception attempting to set state: {tesTask.State}. Another actor set state: {currentTesTask?.State}"); - // currentTesTask?.SetWarning("ConcurrencyWriteFailure", tesTask.State.ToString(), exc.Message, exc.StackTrace); - // } - // else - // { - // logger.LogError(exc, $"Updating TES Task '{tesTask.Id}' threw {exc.GetType().FullName}: '{exc.Message}'. Stack trace: {exc.StackTrace}"); - // currentTesTask?.SetWarning("UnknownError", exc.Message, exc.StackTrace); - // } - - // if (currentTesTask is not null) - // { - // await repository.UpdateItemAsync(currentTesTask); - // } - //} - catch (Exception exc) - { - logger.LogError(exc, "Updating TES Task '{TesTask}' threw {ExceptionType}: '{ExceptionMessage}'. 
Stack trace: {ExceptionStackTrace}", tesTask.Id, exc.GetType().FullName, exc.Message, exc.StackTrace); - } - - if (!string.IsNullOrWhiteSpace(tesTask.PoolId) && (TesState.QUEUEDEnum == tesTask.State || TesState.RUNNINGEnum == tesTask.State)) - { - pools.Add(tesTask.PoolId); - } - } - - if (batchScheduler.NeedPoolFlush) - { - await batchScheduler.FlushPoolsAsync(pools, stoppingToken); - } - - logger.LogDebug("OrchestrateTesTasksOnBatch for {TaskCount} tasks completed in {TotalSeconds} seconds.", tesTasks.Length, DateTime.UtcNow.Subtract(startTime).TotalSeconds); - } } } From b2eaa9d5a26a4b52457d52d462ad329fc2e7aeec Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 29 Sep 2023 12:34:20 -0700 Subject: [PATCH 008/202] Wire in initial pool size --- src/TesApi.Web/BatchScheduler.cs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index a6d48e081..9b851ef85 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -639,6 +639,7 @@ private bool TryGetCromwellTmpFilePath(string fileUri, out string localPath) vmSize: virtualMachineInfo.VmSize, autoscaled: true, preemptable: virtualMachineInfo.LowPriority, + neededPoolCounts[poolKey], nodeInfo: useGen2.GetValueOrDefault() ? gen2BatchNodeInfo : gen1BatchNodeInfo, containerConfiguration: containerMetadata.ContainerConfiguration, encryptionAtHostSupported: virtualMachineInfo.EncryptionAtHostSupported, @@ -1577,13 +1578,14 @@ private static BatchModels.BatchPoolIdentity GetBatchPoolIdentity(string[] ident /// /// /// + /// /// /// /// VM supports encryption at host. /// A for controlling the lifetime of the asynchronous operation. /// /// We use the PoolSpecification for both the namespace of all the constituent parts and for the fact that it allows us to configure shared and autopools using the same code. 
- private async ValueTask GetPoolSpecification(string vmSize, bool autoscaled, bool preemptable, BatchNodeInfo nodeInfo, ContainerConfiguration containerConfiguration, bool encryptionAtHostSupported, CancellationToken cancellationToken) + private async ValueTask GetPoolSpecification(string vmSize, bool autoscaled, bool preemptable, int initialTarget, BatchNodeInfo nodeInfo, ContainerConfiguration containerConfiguration, bool encryptionAtHostSupported, CancellationToken cancellationToken) { // Any changes to any properties set in this method will require corresponding changes to ConvertPoolSpecificationToModelsPool() @@ -1618,13 +1620,13 @@ private async ValueTask GetPoolSpecification(string vmSize, b { poolSpecification.AutoScaleEnabled = true; poolSpecification.AutoScaleEvaluationInterval = BatchPool.AutoScaleEvaluationInterval; - poolSpecification.AutoScaleFormula = BatchPool.AutoPoolFormula(preemptable, 1); + poolSpecification.AutoScaleFormula = BatchPool.AutoPoolFormula(preemptable, initialTarget); } else { poolSpecification.AutoScaleEnabled = false; - poolSpecification.TargetLowPriorityComputeNodes = preemptable == true ? 1 : 0; - poolSpecification.TargetDedicatedComputeNodes = preemptable == false ? 1 : 0; + poolSpecification.TargetLowPriorityComputeNodes = preemptable == true ? initialTarget : 0; + poolSpecification.TargetDedicatedComputeNodes = preemptable == false ? 
initialTarget : 0; } if (!string.IsNullOrEmpty(batchNodesSubnetId)) From f3e676b37dc5b35940139ef579fa4856fc32ba56 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 29 Sep 2023 14:21:00 -0700 Subject: [PATCH 009/202] Address quota issues arising from batching queued tasks instead of one-at-a-time --- src/TesApi.Tests/BatchSchedulerTests.cs | 8 +++ src/TesApi.Web/BatchScheduler.cs | 53 +++++++++++++++---- .../Management/ArmBatchQuotaProvider.cs | 7 +++ .../Management/BatchQuotaVerifier.cs | 32 ++++++++--- .../Management/IBatchQuotaProvider.cs | 7 +++ .../Management/IBatchQuotaVerifier.cs | 9 ++++ .../Management/TerraQuotaProvider.cs | 7 +++ 7 files changed, 106 insertions(+), 17 deletions(-) diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index 154b3cc14..8a2874a25 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -1942,6 +1942,9 @@ private class TestBatchQuotaVerifierQuotaMaxedOut : TestBatchQuotaVerifierBase { public TestBatchQuotaVerifierQuotaMaxedOut(IBatchQuotaProvider batchQuotaProvider) : base(batchQuotaProvider) { } + public override Task<(int exceeded, Exception exception)> CheckBatchAccountPoolOrJobQuotasAsync(int required, CancellationToken cancellationToken) + => throw new AzureBatchQuotaMaxedOutException("Test AzureBatchQuotaMaxedOutException"); + public override Task CheckBatchAccountQuotasAsync(VirtualMachineInformation _1, bool _2, System.Threading.CancellationToken cancellationToken) => throw new AzureBatchQuotaMaxedOutException("Test AzureBatchQuotaMaxedOutException"); } @@ -1950,6 +1953,9 @@ private class TestBatchQuotaVerifierLowQuota : TestBatchQuotaVerifierBase { public TestBatchQuotaVerifierLowQuota(IBatchQuotaProvider batchQuotaProvider) : base(batchQuotaProvider) { } + public override Task<(int exceeded, Exception exception)> CheckBatchAccountPoolOrJobQuotasAsync(int required, CancellationToken cancellationToken) + => throw new 
NotSupportedException(); + public override Task CheckBatchAccountQuotasAsync(VirtualMachineInformation _1, bool _2, System.Threading.CancellationToken cancellationToken) => throw new AzureBatchLowQuotaException("Test AzureBatchLowQuotaException"); } @@ -1961,6 +1967,8 @@ private abstract class TestBatchQuotaVerifierBase : IBatchQuotaVerifier protected TestBatchQuotaVerifierBase(IBatchQuotaProvider batchQuotaProvider) => this.batchQuotaProvider = batchQuotaProvider; + public abstract Task<(int exceeded, Exception exception)> CheckBatchAccountPoolOrJobQuotasAsync(int required, CancellationToken cancellationToken); + public abstract Task CheckBatchAccountQuotasAsync(VirtualMachineInformation virtualMachineInformation, bool needPoolOrJobQuotaCheck, System.Threading.CancellationToken cancellationToken); public IBatchQuotaProvider GetBatchQuotaProvider() diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 9b851ef85..da3e99c49 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -563,11 +563,11 @@ private bool TryGetCromwellTmpFilePath(string fileUri, out string localPath) /// public async IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessQueuedTesTasksAsync(TesTask[] tesTasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) { - var taskMetadataByPools = new Dictionary>(); - var poolKeyByTasks = new Dictionary(); + var taskMetadataListByPoolKey = new Dictionary>(); + var poolKeyByTaskIds = new Dictionary(); var tasks = tesTasks.ToList(); - // Determe how many nodes in each pool we might need for this group. + // Determine how many nodes in each pool we might need for this group. 
foreach (var tesTask in tesTasks) { Task quickResult = default; @@ -580,16 +580,16 @@ private bool TryGetCromwellTmpFilePath(string fileUri, out string localPath) (poolKey, var displayName) = GetPoolKey(tesTask, virtualMachineInfo, containerMetadata.ContainerConfiguration, cancellationToken); await quotaVerifier.CheckBatchAccountQuotasAsync(virtualMachineInfo, needPoolOrJobQuotaCheck: !IsPoolAvailable(poolKey), cancellationToken: cancellationToken); - if (taskMetadataByPools.TryGetValue(poolKey, out var resource)) + if (taskMetadataListByPoolKey.TryGetValue(poolKey, out var resource)) { resource.Add((tesTask, virtualMachineInfo, containerMetadata, displayName)); } else { - taskMetadataByPools.Add(poolKey, new() { (tesTask, virtualMachineInfo, containerMetadata, displayName) }); + taskMetadataListByPoolKey.Add(poolKey, new() { (tesTask, virtualMachineInfo, containerMetadata, displayName) }); } - poolKeyByTasks.Add(tesTask.Id, poolKey); + poolKeyByTaskIds.Add(tesTask.Id, poolKey); } catch (Exception ex) { @@ -603,12 +603,43 @@ private bool TryGetCromwellTmpFilePath(string fileUri, out string localPath) } } - var neededPoolCounts = taskMetadataByPools.ToDictionary(t => t.Key, t => t.Value.Count); + // Remove already returned tasks from the dictionary + taskMetadataListByPoolKey = taskMetadataListByPoolKey + .Select(p => (p.Key, Value: p.Value.Where(v => tasks.Contains(v.TesTask)).ToList())) // keep only tasks that remain in the tasks variable + .Where(t => t.Value.Count != 0) // Remove any empty pool keys + .ToDictionary(p => p.Key, p => p.Value); - foreach (var (tesTask, virtualMachineInfo, containerMetadata, displayName) in taskMetadataByPools.Values.SelectMany(e => e).Where(m => tasks.Contains(m.TesTask))) + // Determine how many nodes in each new pool we might need for this group. 
+ var neededPoolNodesByPoolKey = taskMetadataListByPoolKey.ToDictionary(t => t.Key, t => t.Value.Count); + + // Determine which new pools/jobs we will need for this batch (one per pool key that has no available pool) + var requiredNewPools = neededPoolNodesByPoolKey.Keys.Where(key => !IsPoolAvailable(key)).ToList(); + + // Revisit pool/job quotas (the above loop already dealt with the possibility of needing just one more pool/job) + if (requiredNewPools.Count > 1) + { + var (excess, exception) = await quotaVerifier.CheckBatchAccountPoolOrJobQuotasAsync(requiredNewPools.Count, cancellationToken); + var initial = requiredNewPools.Count - 1; + var final = Math.Max(-1, initial - excess); // never index below the first new pool key + + for (var i = initial; i > final; --i) + { + var key = requiredNewPools[i]; + if (taskMetadataListByPoolKey.Remove(key, out var listOfTaskMetadata)) + { + foreach (var (task, _, _, _) in listOfTaskMetadata) + { + yield return (task, Task.FromException(exception)); + } + } + } + } + + // Obtain assigned pool and create and assign the cloudtask for each task. + foreach (var (tesTask, virtualMachineInfo, containerMetadata, displayName) in taskMetadataListByPoolKey.Values.SelectMany(e => e)) { Task quickResult = default; - var poolKey = poolKeyByTasks[tesTask.Id]; + var poolKey = poolKeyByTaskIds[tesTask.Id]; try { @@ -639,7 +670,7 @@ private bool TryGetCromwellTmpFilePath(string fileUri, out string localPath) vmSize: virtualMachineInfo.VmSize, autoscaled: true, preemptable: virtualMachineInfo.LowPriority, - neededPoolCounts[poolKey], + neededPoolNodesByPoolKey[poolKey], nodeInfo: useGen2.GetValueOrDefault() ? 
gen2BatchNodeInfo : gen1BatchNodeInfo, containerConfiguration: containerMetadata.ContainerConfiguration, encryptionAtHostSupported: virtualMachineInfo.EncryptionAtHostSupported, @@ -651,7 +682,7 @@ private bool TryGetCromwellTmpFilePath(string fileUri, out string localPath) tesTask.PoolId = poolInformation.PoolId; var cloudTask = await ConvertTesTaskToBatchTaskAsync(jobOrTaskId, tesTask, containerMetadata.IsPublic, cancellationToken); - logger.LogInformation($"Creating batch task for TES task {tesTask.Id}. Using VM size {virtualMachineInfo.VmSize}."); + logger.LogInformation(@"Creating batch task for TES task {TesTaskId}. Using VM size {VmSize}.", tesTask.Id, virtualMachineInfo.VmSize); await azureProxy.AddBatchTaskAsync(tesTask.Id, cloudTask, poolInformation, cancellationToken); tesTaskLog.StartTime = DateTimeOffset.UtcNow; diff --git a/src/TesApi.Web/Management/ArmBatchQuotaProvider.cs b/src/TesApi.Web/Management/ArmBatchQuotaProvider.cs index 40066eecb..6c4015642 100644 --- a/src/TesApi.Web/Management/ArmBatchQuotaProvider.cs +++ b/src/TesApi.Web/Management/ArmBatchQuotaProvider.cs @@ -74,6 +74,13 @@ public async Task GetVmCoreQuotaAsync(bool lowPriority, Cancel new AccountQuota(batchQuota.ActiveJobAndJobScheduleQuota, batchQuota.PoolQuota, batchQuota.DedicatedCoreQuota, batchQuota.LowPriorityCoreQuota)); } + /// + public async Task<(int PoolQuota, int ActiveJobAndJobScheduleQuota)> GetPoolOrJobQuotaAsync(CancellationToken cancellationToken) + { + var quotas = await GetBatchAccountQuotasAsync(cancellationToken); + return (quotas.PoolQuota, quotas.ActiveJobAndJobScheduleQuota); + } + /// /// Getting the batch account quota. 
/// diff --git a/src/TesApi.Web/Management/BatchQuotaVerifier.cs b/src/TesApi.Web/Management/BatchQuotaVerifier.cs index 604c622eb..2d96ce5aa 100644 --- a/src/TesApi.Web/Management/BatchQuotaVerifier.cs +++ b/src/TesApi.Web/Management/BatchQuotaVerifier.cs @@ -6,6 +6,7 @@ using System.Threading; using System.Threading.Tasks; using Microsoft.Extensions.Logging; +using Newtonsoft.Json; using Tes.Models; using TesApi.Web.Management.Models.Quotas; @@ -106,18 +107,37 @@ public async Task CheckBatchAccountQuotasAsync(VirtualMachineInformation virtual if (needPoolOrJobQuotaCheck) { - if (batchUtilization.ActiveJobsCount + 1 > batchVmFamilyBatchQuotas.ActiveJobAndJobScheduleQuota) - { - throw new AzureBatchQuotaMaxedOutException($"No remaining active jobs quota available. There are {batchUtilization.ActivePoolsCount} active jobs out of {batchVmFamilyBatchQuotas.ActiveJobAndJobScheduleQuota}."); - } + var (_, exception) = CheckBatchAccountPoolOrJobQuotasImpl(required: 1, batchUtilization.ActiveJobsCount, batchUtilization.ActivePoolsCount, batchVmFamilyBatchQuotas.ActiveJobAndJobScheduleQuota, batchVmFamilyBatchQuotas.PoolQuota); - if (batchUtilization.ActivePoolsCount + 1 > batchVmFamilyBatchQuotas.PoolQuota) + if (exception is not null) { - throw new AzureBatchQuotaMaxedOutException($"No remaining pool quota available. 
There are {batchUtilization.ActivePoolsCount} pools in use out of {batchVmFamilyBatchQuotas.PoolQuota}."); + throw exception; } } } + /// + public async Task<(int exceeded, Exception exception)> CheckBatchAccountPoolOrJobQuotasAsync(int required, CancellationToken cancellationToken) + { + var (poolQuota, activeJobAndJobScheduleQuota) = await batchQuotaProvider.GetPoolOrJobQuotaAsync(cancellationToken); + return CheckBatchAccountPoolOrJobQuotasImpl(required, azureProxy.GetBatchActiveJobCount(), azureProxy.GetBatchActivePoolCount(), activeJobAndJobScheduleQuota, poolQuota); + } + + private static (int exceeded, Exception exception) CheckBatchAccountPoolOrJobQuotasImpl(int required, int activeJobsCount, int activePoolsCount, int activeJobAndJobScheduleQuota, int poolQuota) + { + if (activeJobsCount + required > activeJobAndJobScheduleQuota) + { + return (activeJobsCount + required - activeJobAndJobScheduleQuota, new AzureBatchQuotaMaxedOutException($"No remaining active jobs quota available. There are {activeJobsCount} active jobs out of {activeJobAndJobScheduleQuota}.")); + } + + if (activePoolsCount + required > poolQuota) + { + return (activePoolsCount + required - poolQuota, new AzureBatchQuotaMaxedOutException($"No remaining pool quota available. There are {activePoolsCount} pools in use out of {poolQuota}.")); + } + + return (0, null); + } + /// public IBatchQuotaProvider GetBatchQuotaProvider() => batchQuotaProvider; diff --git a/src/TesApi.Web/Management/IBatchQuotaProvider.cs b/src/TesApi.Web/Management/IBatchQuotaProvider.cs index 0ee2ccffc..33c24e878 100644 --- a/src/TesApi.Web/Management/IBatchQuotaProvider.cs +++ b/src/TesApi.Web/Management/IBatchQuotaProvider.cs @@ -33,5 +33,12 @@ Task GetQuotaForRequirementAsync( /// A for controlling the lifetime of the asynchronous operation. /// Task GetVmCoreQuotaAsync(bool lowPriority, CancellationToken cancellationToken); + + /// + /// Returns the Pool and Active Job and JobSchedule quotas. 
+ /// + /// A for controlling the lifetime of the asynchronous operation. + /// + Task<(int PoolQuota, int ActiveJobAndJobScheduleQuota)> GetPoolOrJobQuotaAsync(CancellationToken cancellationToken); } } diff --git a/src/TesApi.Web/Management/IBatchQuotaVerifier.cs b/src/TesApi.Web/Management/IBatchQuotaVerifier.cs index 0c72e0642..8e9e48e56 100644 --- a/src/TesApi.Web/Management/IBatchQuotaVerifier.cs +++ b/src/TesApi.Web/Management/IBatchQuotaVerifier.cs @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. +using System; using System.Threading; using System.Threading.Tasks; using Tes.Models; @@ -23,6 +24,14 @@ public interface IBatchQuotaVerifier /// Thrown when a max quota condition was identified Task CheckBatchAccountQuotasAsync(VirtualMachineInformation virtualMachineInformation, bool needPoolOrJobQuotaCheck, CancellationToken cancellationToken); + /// + /// Checks if the current quota allows creation of the requested quantity of new Pools and Jobs. + /// + /// + /// A for controlling the lifetime of the asynchronous operation. + /// The size of the portion of that would've resulted in the returned . + Task<(int exceeded, Exception exception)> CheckBatchAccountPoolOrJobQuotasAsync(int required, CancellationToken cancellationToken); + /// /// Gets the instance of the batch quota provider. 
/// diff --git a/src/TesApi.Web/Management/TerraQuotaProvider.cs b/src/TesApi.Web/Management/TerraQuotaProvider.cs index 862d031a0..d0e98ccd8 100644 --- a/src/TesApi.Web/Management/TerraQuotaProvider.cs +++ b/src/TesApi.Web/Management/TerraQuotaProvider.cs @@ -85,6 +85,13 @@ public async Task GetVmCoreQuotaAsync(bool lowPriority, Cancel batchQuota.QuotaValues.LowPriorityCoreQuota)); } + /// + public async Task<(int PoolQuota, int ActiveJobAndJobScheduleQuota)> GetPoolOrJobQuotaAsync(CancellationToken cancellationToken) + { + var quotas = await GetBatchAccountQuotaFromTerraAsync(cancellationToken); + return (quotas.QuotaValues.PoolQuota, quotas.QuotaValues.ActiveJobAndJobScheduleQuota); + } + private async Task GetBatchAccountQuotaFromTerraAsync(CancellationToken cancellationToken) { var batchResourceId = await GetBatchAccountResourceIdFromLandingZone(cancellationToken); From 64dde8174302c0c5a6e35988b9c6c86e68518bb5 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 2 Oct 2023 10:42:39 -0700 Subject: [PATCH 010/202] Change delay to interval --- .../OrchestrateOnBatchSchedulerService.cs | 33 +++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/src/TesApi.Web/OrchestrateOnBatchSchedulerService.cs b/src/TesApi.Web/OrchestrateOnBatchSchedulerService.cs index 674c5ea44..8cbb464af 100644 --- a/src/TesApi.Web/OrchestrateOnBatchSchedulerService.cs +++ b/src/TesApi.Web/OrchestrateOnBatchSchedulerService.cs @@ -98,7 +98,7 @@ protected virtual void ExecuteSetup(CancellationToken stoppingToken) { } /// /// Runs repeatedly at an interval of . /// - /// Delay between runs of . + /// Interval to rerun . /// Action to repeatedly run. /// Triggered when is called. /// A System.Threading.Tasks.Task that represents the long running operations. 
@@ -106,22 +106,29 @@ protected async Task ExecuteActionOnIntervalAsync(TimeSpan runInterval, Func From 03465ea9471e30afc398fae1cc27e9b2977626d1 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 2 Oct 2023 11:34:38 -0700 Subject: [PATCH 011/202] Stub in processing node events --- src/TesApi.Web/Scheduler.cs | 40 ++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/src/TesApi.Web/Scheduler.cs b/src/TesApi.Web/Scheduler.cs index f1cac5709..3da48eb6f 100644 --- a/src/TesApi.Web/Scheduler.cs +++ b/src/TesApi.Web/Scheduler.cs @@ -58,7 +58,11 @@ protected override async Task ExecuteSetupAsync(CancellationToken stoppingToken) /// A System.Threading.Tasks.Task that represents the long running operations. protected override Task ExecuteCoreAsync(CancellationToken stoppingToken) { - return Task.WhenAll(ExecuteCancelledTesTasksOnBatchAsync(stoppingToken), ExecuteQueuedTesTasksOnBatchAsync(stoppingToken), ExecuteTerminatedTesTasksOnBatchAsync(stoppingToken)); + return Task.WhenAll( + ExecuteCancelledTesTasksOnBatchAsync(stoppingToken), + ExecuteQueuedTesTasksOnBatchAsync(stoppingToken), + ExecuteTerminatedTesTasksOnBatchAsync(stoppingToken), + ExecuteUpdateTesTaskFromEventBlobAsync(stoppingToken)); } /// @@ -82,6 +86,7 @@ private Task ExecuteQueuedTesTasksOnBatchAsync(CancellationToken stoppingToken) /// /// Retrieves all cancelled TES tasks from the database, performs an action in the batch system, and updates the resultant state /// + /// Triggered when Microsoft.Extensions.Hosting.IHostedService.StopAsync(System.Threading.CancellationToken) is called. 
/// private Task ExecuteCancelledTesTasksOnBatchAsync(CancellationToken stoppingToken) { @@ -100,6 +105,7 @@ private Task ExecuteCancelledTesTasksOnBatchAsync(CancellationToken stoppingToke /// /// Retrieves all terminated TES tasks from the database, performs an action in the batch system, and updates the resultant state /// + /// Triggered when Microsoft.Extensions.Hosting.IHostedService.StopAsync(System.Threading.CancellationToken) is called. /// private Task ExecuteTerminatedTesTasksOnBatchAsync(CancellationToken stoppingToken) { @@ -114,5 +120,37 @@ private Task ExecuteTerminatedTesTasksOnBatchAsync(CancellationToken stoppingTok cancellationToken => OrchestrateTesTasksOnBatchAsync("Terminated", query, batchScheduler.ProcessTerminatedTesTasksAsync, cancellationToken), stoppingToken); } + + /// + /// Retrieves all event blobs from storage and updates the resultant state. + /// + /// Triggered when Microsoft.Extensions.Hosting.IHostedService.StopAsync(System.Threading.CancellationToken) is called. + /// + private Task ExecuteUpdateTesTaskFromEventBlobAsync(CancellationToken stoppingToken) + { + return ExecuteActionOnIntervalAsync(runInterval, + UpdateTesTasksFromEventBlobsAsync, + stoppingToken); + } + + // TODO: Implement this + /// + /// Retrieves all event blobs from storage and updates the resultant state. + /// + /// Triggered when Microsoft.Extensions.Hosting.IHostedService.StopAsync(System.Threading.CancellationToken) is called. 
+ /// + ValueTask UpdateTesTasksFromEventBlobsAsync(CancellationToken stoppingToken) + { + return ValueTask.CompletedTask; + + //// Get and parse event blobs + + //// Get TesTask for each blob + + //// Update TesTasks + //await OrchestrateTesTasksOnBatchAsync("NodeEvent", query, (tasks, cancellationToken) => batchScheduler.MethodToBeWrittenAsync(tasks, events, cancellationToken), stoppingToken); + + //// Delete blobs + } } } From 81bc5843d9445d2efb308ca19242ef02b7a6675e Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 2 Oct 2023 17:53:49 -0700 Subject: [PATCH 012/202] WIP: Start to restore required error handling --- src/TesApi.Web/AzureBatchTaskState.cs | 88 +++ src/TesApi.Web/BatchPoolService.cs | 25 +- src/TesApi.Web/BatchScheduler.cs | 526 +++++++++--------- src/TesApi.Web/BatchTaskState.cs | 81 --- .../SystemThreadingTaskExtensions.cs | 82 +++ src/TesApi.Web/IBatchScheduler.cs | 17 +- .../OrchestrateOnBatchSchedulerService.cs | 2 +- 7 files changed, 475 insertions(+), 346 deletions(-) create mode 100644 src/TesApi.Web/AzureBatchTaskState.cs delete mode 100644 src/TesApi.Web/BatchTaskState.cs create mode 100644 src/TesApi.Web/Extensions/SystemThreadingTaskExtensions.cs diff --git a/src/TesApi.Web/AzureBatchTaskState.cs b/src/TesApi.Web/AzureBatchTaskState.cs new file mode 100644 index 000000000..91dc69a4f --- /dev/null +++ b/src/TesApi.Web/AzureBatchTaskState.cs @@ -0,0 +1,88 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System; +using System.Collections.Generic; +using static TesApi.Web.AzureBatchTaskState; + +namespace TesApi.Web +{ + /// + /// Combined state of an attempt to run a + /// + /// Task state. See . + /// File details after the task has completed successfully, for logging purposes. + /// Failure information. + /// . + /// . + /// . + /// . + public record AzureBatchTaskState(TaskState State, IEnumerable OutputFileLogs = default, FailureInformation Failure = default, DateTimeOffset? 
CloudTaskCreationTime = default, DateTimeOffset? BatchTaskStartTime = default, DateTimeOffset? BatchTaskEndTime = default, int? BatchTaskExitCode = default) + { + /// + /// TesTask's state + /// + public enum TaskState + { + /// + /// The task has been assigned to a compute node, but is waiting for a + /// required Job Preparation task to complete on the node. + /// + Initializing, + + /// + /// The task is running on a compute node. + /// + Running, + + /// + /// The task is no longer eligible to run, usually because the task has + /// finished successfully, or the task has finished unsuccessfully and + /// has exhausted its retry limit. A task is also marked as completed + /// if an error occurred launching the task, or when the task has been + /// terminated. + /// + CompletedSuccessfully, + + /// + /// The task has completed, but it finished unsuccessfully or the executor had an error + /// + CompletedWithErrors, + + /// + /// Azure Batch was unable to allocate a machine for the job. This could be due to either a temporary or permanent unavailability of the given VM SKU + /// + NodeAllocationFailed, + + /// + /// Azure Batch pre-empted the execution of this task while running on a low-priority node + /// + NodePreempted, + + /// + /// node in an Unusable state detected + /// + NodeUnusable, + + /// + /// Node failed during startup or task execution (for example, ContainerInvalidImage, DiskFull) + /// + NodeFailedDuringStartupOrExecution + } + + /// + /// OutputFileLog describes a single output file. This describes file details after the task has completed successfully, for logging purposes. + /// + /// URL of the file in storage, e.g. s3://bucket/file.txt + /// Path of the file inside the container. Must be an absolute path. + /// Size of the file in bytes. + public record OutputFileLog(Uri Url, string Path, long Size); + + /// + /// TesTask's failure information + /// + /// Failure code. Intended to be machine readable. See . 
+ /// Failure details to be added to . + public record FailureInformation(string Reason, IEnumerable SystemLogs); + } +} diff --git a/src/TesApi.Web/BatchPoolService.cs b/src/TesApi.Web/BatchPoolService.cs index 109c1ff89..7174fd7ee 100644 --- a/src/TesApi.Web/BatchPoolService.cs +++ b/src/TesApi.Web/BatchPoolService.cs @@ -128,7 +128,7 @@ await OrchestrateTesTasksOnBatchAsync( #pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously async token => GetTesTasks(token), #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously - (tesTasks, token) => batchScheduler.ProcessCompletedTesTasksAsync(tesTasks, tasks.ToArray(), token), + (tesTasks, token) => batchScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, tasks.Select(GetBatchState).ToArray(), token), stoppingToken); async IAsyncEnumerable GetTesTasks([EnumeratorCancellation] CancellationToken cancellationToken) @@ -142,6 +142,29 @@ async IAsyncEnumerable GetTesTasks([EnumeratorCancellation] Cancellatio } } } + + AzureBatchTaskState GetBatchState(CloudTask task) + { + if (task.ExecutionInformation.ExitCode != 0 || task.ExecutionInformation.FailureInformation is not null) + { + return new(AzureBatchTaskState.TaskState.CompletedWithErrors, + Failure: new(task.ExecutionInformation.FailureInformation?.Code, + Enumerable.Empty<string>() + .Append(task.ExecutionInformation.FailureInformation?.Message) + .Append($"Batch task ExitCode: {task.ExecutionInformation?.ExitCode}, Failure message: {task.ExecutionInformation?.FailureInformation?.Message}") + .Concat(task.ExecutionInformation.FailureInformation?.Details?.Select(pair => pair.Value) ?? Enumerable.Empty<string>())), + BatchTaskStartTime: task.ExecutionInformation.StartTime, + 
BatchTaskEndTime: task.ExecutionInformation.EndTime, + BatchTaskExitCode: task.ExecutionInformation.ExitCode); + } + } } } } diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index da3e99c49..17aa03177 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -67,7 +67,7 @@ public partial class BatchScheduler : IBatchScheduler private readonly IStorageAccessProvider storageAccessProvider; private readonly IBatchQuotaVerifier quotaVerifier; private readonly IBatchSkuInformationProvider skuInformationProvider; - //private readonly List tesTaskStateTransitions; + private readonly IList tesTaskStateTransitions; private readonly bool usePreemptibleVmsOnly; private readonly string batchNodesSubnetId; private readonly bool disableBatchNodesPublicIpAddress; @@ -179,103 +179,113 @@ public BatchScheduler( logger.LogInformation($"usePreemptibleVmsOnly: {usePreemptibleVmsOnly}"); - //static bool tesTaskIsQueuedInitializingOrRunning(TesTask tesTask) => tesTask.State == TesState.QUEUEDEnum || tesTask.State == TesState.INITIALIZINGEnum || tesTask.State == TesState.RUNNINGEnum; - //static bool tesTaskIsInitializingOrRunning(TesTask tesTask) => tesTask.State == TesState.INITIALIZINGEnum || tesTask.State == TesState.RUNNINGEnum; - //static bool tesTaskIsQueuedOrInitializing(TesTask tesTask) => tesTask.State == TesState.QUEUEDEnum || tesTask.State == TesState.INITIALIZINGEnum; - //static bool tesTaskIsQueued(TesTask tesTask) => tesTask.State == TesState.QUEUEDEnum; - //static bool tesTaskCancellationRequested(TesTask tesTask) => tesTask.State == TesState.CANCELINGEnum; - //static bool tesTaskDeletionReady(TesTask tesTask) => tesTask.IsTaskDeletionRequired; - - //static void SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, CombinedBatchTaskInfo batchInfo) - //{ - // tesTask.State = newTaskState; - - // var tesTaskLog = tesTask.GetOrAddTesTaskLog(); - // var tesTaskExecutorLog = tesTaskLog.GetOrAddExecutorLog(); - - 
// tesTaskLog.BatchNodeMetrics = batchInfo.BatchNodeMetrics; - // tesTaskLog.CromwellResultCode = batchInfo.CromwellRcCode; - // tesTaskLog.EndTime = DateTime.UtcNow; - // tesTaskExecutorLog.StartTime = batchInfo.BatchTaskStartTime; - // tesTaskExecutorLog.EndTime = batchInfo.BatchTaskEndTime; - // tesTaskExecutorLog.ExitCode = batchInfo.BatchTaskExitCode; - - // // Only accurate when the task completes successfully, otherwise it's the Batch time as reported from Batch - // // TODO this could get large; why? - // //var timefromCoAScriptCompletionToBatchTaskDetectedComplete = tesTaskLog.EndTime - tesTaskExecutorLog.EndTime; - - // tesTask.SetFailureReason(batchInfo.FailureReason); - - // if (batchInfo.SystemLogItems is not null) - // { - // tesTask.AddToSystemLog(batchInfo.SystemLogItems); - // } - // else if (!string.IsNullOrWhiteSpace(batchInfo.AlternateSystemLogItem)) - // { - // tesTask.AddToSystemLog(new[] { batchInfo.AlternateSystemLogItem }); - // } - //} - - //async Task SetTaskCompleted(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) - //{ - // await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); - // SetTaskStateAndLog(tesTask, TesState.COMPLETEEnum, batchInfo); - //} - - //async Task SetTaskExecutorError(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) - //{ - // await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); - // SetTaskStateAndLog(tesTask, TesState.EXECUTORERROREnum, batchInfo); - //} - - //async Task SetTaskSystemError(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) - //{ - // await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); - // SetTaskStateAndLog(tesTask, TesState.SYSTEMERROREnum, batchInfo); - //} - - //async Task DeleteBatchJobAndSetTaskStateAsync(TesTask tesTask, TesState newTaskState, CombinedBatchTaskInfo batchInfo, 
CancellationToken cancellationToken) - //{ - // await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); - // SetTaskStateAndLog(tesTask, newTaskState, batchInfo); - //} - - //Task DeleteBatchJobAndSetTaskExecutorErrorAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) => DeleteBatchJobAndSetTaskStateAsync(tesTask, TesState.EXECUTORERROREnum, batchInfo, cancellationToken); - //Task DeleteBatchJobAndSetTaskSystemErrorAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) => DeleteBatchJobAndSetTaskStateAsync(tesTask, TesState.SYSTEMERROREnum, batchInfo, cancellationToken); - - //Task DeleteBatchJobAndRequeueTaskAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) - // => ++tesTask.ErrorCount > 3 - // ? AddSystemLogAndDeleteBatchJobAndSetTaskExecutorErrorAsync(tesTask, batchInfo, "System Error: Retry count exceeded.", cancellationToken) - // : DeleteBatchJobAndSetTaskStateAsync(tesTask, TesState.QUEUEDEnum, batchInfo, cancellationToken); - - //Task AddSystemLogAndDeleteBatchJobAndSetTaskExecutorErrorAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, string alternateSystemLogItem, CancellationToken cancellationToken) - //{ - // batchInfo.SystemLogItems ??= Enumerable.Empty().Append(alternateSystemLogItem); - // return DeleteBatchJobAndSetTaskExecutorErrorAsync(tesTask, batchInfo, cancellationToken); - //} - - //Task HandlePreemptedNodeAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) - //{ - // logger.LogInformation("The TesTask {TesTask}'s node was preempted. 
It will be automatically rescheduled.", tesTask.Id); - // tesTask.State = TesState.INITIALIZINGEnum; - // return Task.FromResult(false); - //} - - //tesTaskStateTransitions = new List() - //{ - // new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.Initializing, alternateSystemLogItem: null, (tesTask, _) => tesTask.State = TesState.INITIALIZINGEnum), - // new TesTaskStateTransition(tesTaskIsQueuedOrInitializing, BatchTaskState.NodeAllocationFailed, alternateSystemLogItem: null, DeleteBatchJobAndRequeueTaskAsync), - // new TesTaskStateTransition(tesTaskIsQueuedOrInitializing, BatchTaskState.Running, alternateSystemLogItem: null, (tesTask, _) => tesTask.State = TesState.RUNNINGEnum), - // new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, BatchTaskState.MoreThanOneActiveJobOrTaskFound, BatchTaskState.MoreThanOneActiveJobOrTaskFound.ToString(), DeleteBatchJobAndSetTaskSystemErrorAsync), - // new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, BatchTaskState.CompletedSuccessfully, alternateSystemLogItem: null, SetTaskCompleted), - // new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, BatchTaskState.CompletedWithErrors, "Please open an issue. There should have been an error reported here.", SetTaskExecutorError), - // new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, BatchTaskState.ActiveJobWithMissingAutoPool, alternateSystemLogItem: null, DeleteBatchJobAndRequeueTaskAsync), - // new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, BatchTaskState.NodeFailedDuringStartupOrExecution, "Please open an issue. There should have been an error reported here.", DeleteBatchJobAndSetTaskExecutorErrorAsync), - // new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, BatchTaskState.NodeUnusable, "Please open an issue. 
There should have been an error reported here.", DeleteBatchJobAndSetTaskExecutorErrorAsync), - // new TesTaskStateTransition(tesTaskIsInitializingOrRunning, BatchTaskState.JobNotFound, BatchTaskState.JobNotFound.ToString(), SetTaskSystemError), - // new TesTaskStateTransition(tesTaskIsInitializingOrRunning, BatchTaskState.MissingBatchTask, BatchTaskState.MissingBatchTask.ToString(), DeleteBatchJobAndSetTaskSystemErrorAsync), - // new TesTaskStateTransition(tesTaskIsInitializingOrRunning, BatchTaskState.NodePreempted, alternateSystemLogItem: null, HandlePreemptedNodeAsync) - //}; + static bool tesTaskIsQueuedInitializingOrRunning(TesTask tesTask) => tesTask.State == TesState.QUEUEDEnum || tesTask.State == TesState.INITIALIZINGEnum || tesTask.State == TesState.RUNNINGEnum; + static bool tesTaskIsInitializingOrRunning(TesTask tesTask) => tesTask.State == TesState.INITIALIZINGEnum || tesTask.State == TesState.RUNNINGEnum; + static bool tesTaskIsQueuedOrInitializing(TesTask tesTask) => tesTask.State == TesState.QUEUEDEnum || tesTask.State == TesState.INITIALIZINGEnum; + static bool tesTaskIsQueued(TesTask tesTask) => tesTask.State == TesState.QUEUEDEnum; + static bool tesTaskCancellationRequested(TesTask tesTask) => tesTask.State == TesState.CANCELINGEnum; + static bool tesTaskDeletionReady(TesTask tesTask) => tesTask.IsTaskDeletionRequired; + + async Task SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, AzureBatchTaskState batchInfo, CancellationToken cancellationToken) + { + var metrics = newTaskState == TesState.COMPLETEEnum + ? 
await GetBatchNodeMetricsAndCromwellResultCodeAsync(tesTask, cancellationToken) + : default; + + tesTask.State = newTaskState; + + var tesTaskLog = tesTask.GetOrAddTesTaskLog(); + var tesTaskExecutorLog = tesTaskLog.GetOrAddExecutorLog(); + + tesTaskLog.BatchNodeMetrics = metrics.BatchNodeMetrics; + tesTaskLog.CromwellResultCode = metrics.CromwellRcCode; + tesTaskLog.EndTime = DateTime.UtcNow; + tesTaskExecutorLog.StartTime = metrics.TaskStartTime ?? batchInfo.BatchTaskStartTime; + tesTaskExecutorLog.EndTime = metrics.TaskEndTime ?? batchInfo.BatchTaskEndTime; + tesTaskExecutorLog.ExitCode = batchInfo.BatchTaskExitCode; + + // Only accurate when the task completes successfully, otherwise it's the Batch time as reported from Batch + // TODO this could get large; why? + //var timefromCoAScriptCompletionToBatchTaskDetectedComplete = tesTaskLog.EndTime - tesTaskExecutorLog.EndTime; + + if (batchInfo.Failure is not null) + { + tesTask.SetFailureReason(batchInfo.Failure.Reason); + + if (batchInfo.Failure.SystemLogs is not null) + { + tesTask.AddToSystemLog(batchInfo.Failure.SystemLogs); + } + //else if (!string.IsNullOrWhiteSpace(batchInfo.AlternateSystemLogItem)) + //{ + // tesTask.AddToSystemLog(new[] { batchInfo.AlternateSystemLogItem }); + //} + } + } + + async Task SetTaskCompleted(TesTask tesTask, AzureBatchTaskState batchInfo, CancellationToken cancellationToken) + { + await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); + await SetTaskStateAndLog(tesTask, TesState.COMPLETEEnum, batchInfo, cancellationToken); + } + + async Task SetTaskExecutorError(TesTask tesTask, AzureBatchTaskState batchInfo, CancellationToken cancellationToken) + { + await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); + await SetTaskStateAndLog(tesTask, TesState.EXECUTORERROREnum, batchInfo, cancellationToken); + } + + async Task SetTaskSystemError(TesTask tesTask, AzureBatchTaskState batchInfo, CancellationToken 
cancellationToken) + { + await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); + await SetTaskStateAndLog(tesTask, TesState.SYSTEMERROREnum, batchInfo, cancellationToken); + } + + async Task DeleteBatchJobAndSetTaskStateAsync(TesTask tesTask, TesState newTaskState, AzureBatchTaskState batchInfo, CancellationToken cancellationToken) + { + await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); + await SetTaskStateAndLog(tesTask, newTaskState, batchInfo, cancellationToken); + } + + Task DeleteBatchJobAndSetTaskExecutorErrorAsync(TesTask tesTask, AzureBatchTaskState batchInfo, CancellationToken cancellationToken) => DeleteBatchJobAndSetTaskStateAsync(tesTask, TesState.EXECUTORERROREnum, batchInfo, cancellationToken); + Task DeleteBatchJobAndSetTaskSystemErrorAsync(TesTask tesTask, AzureBatchTaskState batchInfo, CancellationToken cancellationToken) => DeleteBatchJobAndSetTaskStateAsync(tesTask, TesState.SYSTEMERROREnum, batchInfo, cancellationToken); + + Task DeleteBatchJobAndRequeueTaskAsync(TesTask tesTask, AzureBatchTaskState batchInfo, CancellationToken cancellationToken) + => ++tesTask.ErrorCount > 3 + ? AddSystemLogAndDeleteBatchJobAndSetTaskExecutorErrorAsync(tesTask, batchInfo, "System Error: Retry count exceeded.", cancellationToken) + : DeleteBatchJobAndSetTaskStateAsync(tesTask, TesState.QUEUEDEnum, batchInfo, cancellationToken); + + Task AddSystemLogAndDeleteBatchJobAndSetTaskExecutorErrorAsync(TesTask tesTask, AzureBatchTaskState batchInfo, string alternateSystemLogItem, CancellationToken cancellationToken) + { + batchInfo = new(batchInfo.State, batchInfo.OutputFileLogs, new(batchInfo.Failure.Reason, (batchInfo.Failure.SystemLogs ?? 
Enumerable.Empty()).Append(alternateSystemLogItem)), batchInfo.CloudTaskCreationTime, batchInfo.BatchTaskStartTime, batchInfo.BatchTaskEndTime, batchInfo.BatchTaskExitCode); + return DeleteBatchJobAndSetTaskExecutorErrorAsync(tesTask, batchInfo, cancellationToken); + } + + Task HandlePreemptedNodeAsync(TesTask tesTask, AzureBatchTaskState batchInfo, CancellationToken cancellationToken) + { + logger.LogInformation("The TesTask {TesTask}'s node was preempted. It will be automatically rescheduled.", tesTask.Id); + tesTask.State = TesState.INITIALIZINGEnum; + return Task.FromResult(false); + } + + tesTaskStateTransitions = new List() + { + //new TesTaskStateTransition(tesTaskCancellationRequested, batchTaskState: null, alternateSystemLogItem: null, CancelTaskAsync), + //new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.JobNotFound, alternateSystemLogItem: null, (tesTask, _, ct) => AddBatchTaskAsync(tesTask, ct)), + //new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.MissingBatchTask, alternateSystemLogItem: null, (tesTask, _, ct) => AddBatchTaskAsync(tesTask, ct)), + //new TesTaskStateTransition(tesTaskIsQueued, AzureBatchTaskState.TaskState.Initializing, alternateSystemLogItem: null, (tesTask, _) => tesTask.State = TesState.INITIALIZINGEnum), + new TesTaskStateTransition(tesTaskIsQueuedOrInitializing, AzureBatchTaskState.TaskState.NodeAllocationFailed, alternateSystemLogItem: null, DeleteBatchJobAndRequeueTaskAsync), + new TesTaskStateTransition(tesTaskIsQueuedOrInitializing, AzureBatchTaskState.TaskState.Running, alternateSystemLogItem: null, (tesTask, _) => tesTask.State = TesState.RUNNINGEnum), + //new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.MoreThanOneActiveJobOrTaskFound, BatchTaskState.MoreThanOneActiveJobOrTaskFound.ToString(), DeleteBatchJobAndSetTaskSystemErrorAsync), + new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.CompletedSuccessfully, 
alternateSystemLogItem: null, SetTaskCompleted), + new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.CompletedWithErrors, "Please open an issue. There should have been an error reported here.", SetTaskExecutorError), + //new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.ActiveJobWithMissingAutoPool, alternateSystemLogItem: null, DeleteBatchJobAndRequeueTaskAsync), + new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.NodeFailedDuringStartupOrExecution, "Please open an issue. There should have been an error reported here.", DeleteBatchJobAndSetTaskExecutorErrorAsync), + new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.NodeUnusable, "Please open an issue. There should have been an error reported here.", DeleteBatchJobAndSetTaskExecutorErrorAsync), + //new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.JobNotFound, BatchTaskState.JobNotFound.ToString(), SetTaskSystemError), + //new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.MissingBatchTask, BatchTaskState.MissingBatchTask.ToString(), DeleteBatchJobAndSetTaskSystemErrorAsync), + new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.NodePreempted, alternateSystemLogItem: null, HandlePreemptedNodeAsync) + }.AsReadOnly(); } private async Task DeleteCancelledTaskAsync(TesTask tesTask, CancellationToken cancellationToken) @@ -362,83 +372,74 @@ public async Task UploadTaskRunnerIfNeeded(CancellationToken cancellationToken) } } - /// - public async IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessCompletedTesTasksAsync(IEnumerable tesTasks, CloudTask[] cloudTasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) - { - ArgumentNullException.ThrowIfNull(tesTasks); - 
ArgumentNullException.ThrowIfNull(cloudTasks); - - foreach (var (cloudTask, tesTask) in cloudTasks.Zip(tesTasks)) - { - // Skip if this task was previously updated. - if (tesTask.State != TesState.COMPLETEEnum && tesTask.State != TesState.SYSTEMERROREnum && tesTask.State != TesState.EXECUTORERROREnum) - { - try - { - await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); - } - catch (Exception ex) - { - logger.LogError(ex, @"Failure when deleting cloud task for TES task: {TesTask} Exception: {ExceptionType}: {ExceptionMessage}", tesTask.Id, ex.GetType().FullName, ex.Message); - tesTask.IsTaskDeletionRequired = true; - } - - if (cloudTask.ExecutionInformation.ExitCode != 0 || cloudTask.ExecutionInformation.FailureInformation is not null) - { - await SetTaskStateAndLog(tesTask, TesState.EXECUTORERROREnum, cloudTask); - } - else - { - await SetTaskStateAndLog(tesTask, TesState.COMPLETEEnum, cloudTask); - } + ///// + //public async IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessCompletedTesTasksAsync(IEnumerable tesTasks, CloudTask[] cloudTasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) + //{ + // ArgumentNullException.ThrowIfNull(tesTasks); + // ArgumentNullException.ThrowIfNull(cloudTasks); - yield return (tesTask, Task.FromResult(true)); - } - } + // foreach (var (cloudTask, tesTask) in cloudTasks.Zip(tesTasks)) + // { + // // Skip if this task was previously updated. 
+ // if (tesTask.State != TesState.COMPLETEEnum && tesTask.State != TesState.SYSTEMERROREnum && tesTask.State != TesState.EXECUTORERROREnum) + // { + // try + // { + // await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); + // } + // catch (Exception ex) + // { + // logger.LogError(ex, @"Failure when deleting cloud task for TES task: {TesTask} Exception: {ExceptionType}: {ExceptionMessage}", tesTask.Id, ex.GetType().FullName, ex.Message); + // tesTask.IsTaskDeletionRequired = true; + // } - async ValueTask SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, CloudTask cloudTask) - { - var metrics = newTaskState == TesState.COMPLETEEnum - ? await GetBatchNodeMetricsAndCromwellResultCodeAsync(tesTask, cancellationToken) - : default; + // if (cloudTask.ExecutionInformation.ExitCode != 0 || cloudTask.ExecutionInformation.FailureInformation is not null) + // { + // await SetTaskStateAndLog(tesTask, TesState.EXECUTORERROREnum, cloudTask); + // } + // else + // { + // await SetTaskStateAndLog(tesTask, TesState.COMPLETEEnum, cloudTask); + // } - tesTask.State = newTaskState; + // yield return (tesTask, Task.FromResult(true)); + // } + // } - var tesTaskLog = tesTask.GetOrAddTesTaskLog(); - var tesTaskExecutorLog = tesTaskLog.GetOrAddExecutorLog(); + // async ValueTask SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, CloudTask cloudTask) + // { + // var metrics = newTaskState == TesState.COMPLETEEnum + // ? await GetBatchNodeMetricsAndCromwellResultCodeAsync(tesTask, cancellationToken) + // : default; - tesTaskLog.BatchNodeMetrics = metrics.BatchNodeMetrics; - tesTaskLog.CromwellResultCode = metrics.CromwellRcCode; - tesTaskLog.EndTime = DateTime.UtcNow; - tesTaskExecutorLog.StartTime = metrics.TaskStartTime ?? cloudTask.ExecutionInformation?.StartTime; - tesTaskExecutorLog.EndTime = metrics.TaskEndTime ?? 
cloudTask.ExecutionInformation?.EndTime; - tesTaskExecutorLog.ExitCode = cloudTask.ExecutionInformation?.ExitCode; + // tesTask.State = newTaskState; - // Only accurate when the task completes successfully, otherwise it's the Batch time as reported from Batch - // TODO this could get large; why? - //var timefromCoAScriptCompletionToBatchTaskDetectedComplete = tesTaskLog.EndTime - tesTaskExecutorLog.EndTime; + // var tesTaskLog = tesTask.GetOrAddTesTaskLog(); + // var tesTaskExecutorLog = tesTaskLog.GetOrAddExecutorLog(); - tesTask.SetFailureReason(cloudTask.ExecutionInformation?.FailureInformation?.Code); + // tesTaskLog.BatchNodeMetrics = metrics.BatchNodeMetrics; + // tesTaskLog.CromwellResultCode = metrics.CromwellRcCode; + // tesTaskLog.EndTime = DateTime.UtcNow; + // tesTaskExecutorLog.StartTime = metrics.TaskStartTime ?? cloudTask.ExecutionInformation?.StartTime; + // tesTaskExecutorLog.EndTime = metrics.TaskEndTime ?? cloudTask.ExecutionInformation?.EndTime; + // tesTaskExecutorLog.ExitCode = cloudTask.ExecutionInformation?.ExitCode; - var systemLogItems = newTaskState == TesState.COMPLETEEnum - ? default - : Enumerable.Empty() - .Append($"Batch task ExitCode: {cloudTask.ExecutionInformation?.ExitCode}, Failure message: {cloudTask.ExecutionInformation?.FailureInformation?.Message}") - .Concat(cloudTask.ExecutionInformation?.FailureInformation?.Details?.Select(d => $"{d.Name}: {d.Value}") ?? Enumerable.Empty()); + // // Only accurate when the task completes successfully, otherwise it's the Batch time as reported from Batch + // // TODO this could get large; why? + // //var timefromCoAScriptCompletionToBatchTaskDetectedComplete = tesTaskLog.EndTime - tesTaskExecutorLog.EndTime; - if (systemLogItems is not null) - { - tesTask.AddToSystemLog(systemLogItems); - } - else if (newTaskState != TesState.COMPLETEEnum) - { - tesTask.AddToSystemLog(new[] { "Please open an issue. There should have been an error reported here." 
}); - } - } - } + // if (newTaskState != TesState.COMPLETEEnum) + // { + // tesTask.SetFailureReason(cloudTask.ExecutionInformation?.FailureInformation?.Code); + // tesTask.AddToSystemLog(Enumerable.Empty() + // .Append($"Batch task ExitCode: {cloudTask.ExecutionInformation?.ExitCode}, Failure message: {cloudTask.ExecutionInformation?.FailureInformation?.Message}") + // .Concat(cloudTask.ExecutionInformation?.FailureInformation?.Details?.Select(d => $"{d.Name}: {d.Value}") ?? Enumerable.Empty())); + // } + // } + //} /// - public async IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessCancelledTesTasksAsync(IEnumerable tesTasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) + public async IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessCancelledTesTasksAsync(IEnumerable tesTasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) { foreach (var task in tesTasks) { @@ -458,7 +459,7 @@ async ValueTask SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, Cloud } /// - public async IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessTerminatedTesTasksAsync(IEnumerable tesTasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) + public async IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessTerminatedTesTasksAsync(IEnumerable tesTasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) { foreach (var task in tesTasks) { @@ -561,7 +562,7 @@ private bool TryGetCromwellTmpFilePath(string fileUri, out string localPath) } /// - public async IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessQueuedTesTasksAsync(TesTask[] tesTasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) + public async IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessQueuedTesTasksAsync(TesTask[] 
tesTasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) { var taskMetadataListByPoolKey = new Dictionary>(); var poolKeyByTaskIds = new Dictionary(); @@ -1040,27 +1041,42 @@ Task HandleException(Exception exception, string poolKey, TesTask tesTask) // } //} - ///// - ///// Transitions the to the new state, based on the rules defined in the tesTaskStateTransitions list. - ///// - ///// TES task - ///// Current Azure Batch task info - ///// A for controlling the lifetime of the asynchronous operation. - ///// True if the TES task was changed. - //// When task is executed the following may be touched: - //// tesTask.Log[].SystemLog - //// tesTask.Log[].FailureReason - //// tesTask.Log[].CromwellResultCode - //// tesTask.Log[].BatchExecutionMetrics - //// tesTask.Log[].EndTime - //// tesTask.Log[].Log[].StdErr - //// tesTask.Log[].Log[].ExitCode - //// tesTask.Log[].Log[].StartTime - //// tesTask.Log[].Log[].EndTime - //private ValueTask HandleTesTaskTransitionAsync(TesTask tesTask, CombinedBatchTaskInfo combinedBatchTaskInfo, CancellationToken cancellationToken) - // => (tesTaskStateTransitions - // .FirstOrDefault(m => (m.Condition is null || m.Condition(tesTask)) && (m.CurrentBatchTaskState is null || m.CurrentBatchTaskState == combinedBatchTaskInfo.BatchTaskState)) - // ?.ActionAsync(tesTask, combinedBatchTaskInfo, cancellationToken) ?? 
ValueTask.FromResult(false)); + + /// + public async IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessTesTaskBatchStatesAsync(IEnumerable tesTasks, AzureBatchTaskState[] taskStates, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) + { + ArgumentNullException.ThrowIfNull(tesTasks); + ArgumentNullException.ThrowIfNull(taskStates); + + await foreach (var result in taskStates.Zip(tesTasks) + .Select(entry => (entry.Second, HandleTesTaskTransitionAsync(entry.Second, entry.First, cancellationToken).AsTask())) + .WhenEach(cancellationToken, tuple => tuple.Item2)) + { + yield return result; + } + } + + /// + /// Transitions the to the new state, based on the rules defined in the tesTaskStateTransitions list. + /// + /// TES task + /// Current Azure Batch task info + /// A for controlling the lifetime of the asynchronous operation. + /// True if the TES task was changed. + // When task is executed the following may be touched: + // tesTask.Log[].SystemLog + // tesTask.Log[].FailureReason + // tesTask.Log[].CromwellResultCode + // tesTask.Log[].BatchExecutionMetrics + // tesTask.Log[].EndTime + // tesTask.Log[].Log[].StdErr + // tesTask.Log[].Log[].ExitCode + // tesTask.Log[].Log[].StartTime + // tesTask.Log[].Log[].EndTime + private ValueTask HandleTesTaskTransitionAsync(TesTask tesTask, AzureBatchTaskState azureBatchTaskState, CancellationToken cancellationToken) + => (tesTaskStateTransitions + .FirstOrDefault(m => (m.Condition is null || m.Condition(tesTask)) && (m.CurrentBatchTaskState is null || m.CurrentBatchTaskState == azureBatchTaskState.State)) + ?.ActionAsync(tesTask, azureBatchTaskState, cancellationToken) ?? ValueTask.FromResult(false)); /// /// Returns job preparation and main Batch tasks that represents the given @@ -1821,7 +1837,7 @@ public async Task GetVmSizeAsync(TesTask tesTask, Can var tesResources = tesTask.Resources; var previouslyFailedVmSizes = tesTask.Logs? 
- .Where(log => log.FailureReason == BatchTaskState.NodeAllocationFailed.ToString() && log.VirtualMachineInfo?.VmSize is not null) + .Where(log => log.FailureReason == AzureBatchTaskState.TaskState.NodeAllocationFailed.ToString() && log.VirtualMachineInfo?.VmSize is not null) .Select(log => log.VirtualMachineInfo.VmSize) .Distinct() .ToList(); @@ -1902,7 +1918,7 @@ public async Task GetVmSizeAsync(TesTask tesTask, Can if (previouslyFailedVmSizes is not null) { - noVmFoundMessage += $" The following VM sizes were excluded from consideration because of {BatchTaskState.NodeAllocationFailed} error(s) on previous attempts: {string.Join(", ", previouslyFailedVmSizes)}."; + noVmFoundMessage += $" The following VM sizes were excluded from consideration because of {AzureBatchTaskState.TaskState.NodeAllocationFailed} error(s) on previous attempts: {string.Join(", ", previouslyFailedVmSizes)}."; } var vmsExcludedByTheAllowedVmsConfiguration = eligibleVms.Except(eligibleVms.Where(allowedVmSizesFilter)).Count(); @@ -2032,85 +2048,77 @@ private static Dictionary DelimitedTextToDictionary(string text, .Select(line => { var parts = line.Split(fieldDelimiter); return new KeyValuePair(parts[0], parts[1]); }) .ToDictionary(kv => kv.Key, kv => kv.Value); - ///// - ///// Class that captures how transitions from current state to the new state, given the current Batch task state and optional condition. - ///// Transitions typically include an action that needs to run in order for the task to move to the new state. - ///// - //private class TesTaskStateTransition - //{ - // public TesTaskStateTransition(Func condition, BatchTaskState? batchTaskState, string alternateSystemLogItem, Func asyncAction) - // : this(condition, batchTaskState, alternateSystemLogItem, asyncAction, null) - // { } + /// + /// Class that captures how transitions from current state to the new state, given the current Batch task state and optional condition. 
+ /// Transitions typically include an action that needs to run in order for the task to move to the new state. + /// + private class TesTaskStateTransition + { + public TesTaskStateTransition(Func condition, AzureBatchTaskState.TaskState? batchTaskState, string alternateSystemLogItem, Func asyncAction) + : this(condition, batchTaskState, alternateSystemLogItem, asyncAction, null) + { } - // public TesTaskStateTransition(Func condition, BatchTaskState? batchTaskState, string alternateSystemLogItem, Action action) - // : this(condition, batchTaskState, alternateSystemLogItem, null, action) - // { - // } + public TesTaskStateTransition(Func condition, AzureBatchTaskState.TaskState? batchTaskState, string alternateSystemLogItem, Action action) + : this(condition, batchTaskState, alternateSystemLogItem, null, action) + { + } - // private TesTaskStateTransition(Func condition, BatchTaskState? batchTaskState, string alternateSystemLogItem, Func asyncAction, Action action) - // { - // Condition = condition; - // CurrentBatchTaskState = batchTaskState; - // AlternateSystemLogItem = alternateSystemLogItem; - // AsyncAction = asyncAction; - // Action = action; - // } + private TesTaskStateTransition(Func condition, AzureBatchTaskState.TaskState? batchTaskState, string alternateSystemLogItem, Func asyncAction, Action action) + { + Condition = condition; + CurrentBatchTaskState = batchTaskState; + AlternateSystemLogItem = alternateSystemLogItem; + AsyncAction = asyncAction; + Action = action; + } - // public Func Condition { get; } - // public BatchTaskState? CurrentBatchTaskState { get; } - // private string AlternateSystemLogItem { get; } - // private Func AsyncAction { get; } - // private Action Action { get; } - - // /// - // /// Calls and/or . - // /// - // /// - // /// - // /// A for controlling the lifetime of the asynchronous operation. - // /// True an action was called, otherwise False. 
- // public async ValueTask ActionAsync(TesTask tesTask, CombinedBatchTaskInfo combinedBatchTaskInfo, CancellationToken cancellationToken) - // { - // combinedBatchTaskInfo.AlternateSystemLogItem = AlternateSystemLogItem; - // var tesTaskChanged = false; + public Func Condition { get; } + public AzureBatchTaskState.TaskState? CurrentBatchTaskState { get; } + private string AlternateSystemLogItem { get; } + private Func AsyncAction { get; } + private Action Action { get; } - // if (AsyncAction is not null) - // { - // await AsyncAction(tesTask, combinedBatchTaskInfo, cancellationToken); - // tesTaskChanged = true; - // } + /// + /// Calls and/or . + /// + /// + /// + /// A for controlling the lifetime of the asynchronous operation. + /// True an action was called, otherwise False. + public async ValueTask ActionAsync(TesTask tesTask, AzureBatchTaskState combinedBatchTaskInfo, CancellationToken cancellationToken) + { + //combinedBatchTaskInfo.AlternateSystemLogItem = AlternateSystemLogItem; + var tesTaskChanged = false; - // if (Action is not null) - // { - // Action(tesTask, combinedBatchTaskInfo); - // tesTaskChanged = true; - // } + if (AsyncAction is not null) + { + await AsyncAction(tesTask, combinedBatchTaskInfo, cancellationToken); + tesTaskChanged = true; + } - // return tesTaskChanged; - // } - //} + if (Action is not null) + { + Action(tesTask, combinedBatchTaskInfo); + tesTaskChanged = true; + } - private class ExternalStorageContainerInfo - { - public string AccountName { get; set; } - public string ContainerName { get; set; } - public string BlobEndpoint { get; set; } - public string SasToken { get; set; } + return tesTaskChanged; + } } - private class CombinedBatchTaskInfo - { - public BatchTaskState BatchTaskState { get; set; } - public BatchNodeMetrics BatchNodeMetrics { get; set; } - public string FailureReason { get; set; } - public DateTimeOffset? BatchTaskStartTime { get; set; } - public DateTimeOffset? BatchTaskEndTime { get; set; } - public int? 
BatchTaskExitCode { get; set; } - public int? CromwellRcCode { get; set; } - public IEnumerable SystemLogItems { get; set; } - public PoolInformation Pool { get; set; } - public string AlternateSystemLogItem { get; set; } - } + //private class CombinedBatchTaskInfo + //{ + // public BatchTaskState BatchTaskState { get; set; } + // public BatchNodeMetrics BatchNodeMetrics { get; set; } + // public string FailureReason { get; set; } + // public DateTimeOffset? BatchTaskStartTime { get; set; } + // public DateTimeOffset? BatchTaskEndTime { get; set; } + // public int? BatchTaskExitCode { get; set; } + // public int? CromwellRcCode { get; set; } + // public IEnumerable SystemLogItems { get; set; } + // public PoolInformation Pool { get; set; } + // public string AlternateSystemLogItem { get; set; } + //} private class TesInputPathEqualityComparer : IEqualityComparer { diff --git a/src/TesApi.Web/BatchTaskState.cs b/src/TesApi.Web/BatchTaskState.cs deleted file mode 100644 index 028ecc1c2..000000000 --- a/src/TesApi.Web/BatchTaskState.cs +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. - -namespace TesApi.Web -{ - /// - /// A superset of Azure Batch task states - /// - public enum BatchTaskState - { - /// - /// The task has been assigned to a compute node, but is waiting for a - /// required Job Preparation task to complete on the node. - /// - Initializing, - - /// - /// The task is running on a compute node. - /// - Running, - - /// - /// The task is no longer eligible to run, usually because the task has - /// finished successfully, or the task has finished unsuccessfully and - /// has exhausted its retry limit. A task is also marked as completed - /// if an error occurred launching the task, or when the task has been - /// terminated. 
- /// - CompletedSuccessfully, - - /// - /// The task has completed, but it finished unsuccessfully or the executor had an error - /// - CompletedWithErrors, - - /// - /// Batch job is active and has non-null ExecutionInformation.PoolId but pool does not exist - /// - ActiveJobWithMissingAutoPool, - - /// - /// Batch job is either not found or is being deleted - /// - JobNotFound, - - /// - /// An error occurred while attempting to retrieve jobs from the Azure Batch API. This can occur due to a network or service issue - /// - ErrorRetrievingJobs, - - /// - /// More than one active batch job/task is associated with a given tes task. This may indicate an implementation defect due to an unforeseen edge case - /// - MoreThanOneActiveJobOrTaskFound, - - /// - /// Azure Batch was unable to allocate a machine for the job. This could be due to either a temporary or permanent unavailability of the given VM SKU - /// - NodeAllocationFailed, - - /// - /// Azure Batch pre-empted the execution of this task while running on a low-priority node - /// - NodePreempted, - - /// - /// node in an Unusable state detected - /// - NodeUnusable, - - /// - /// Batch job exists but task is missing. This can happen if scheduler goes down after creating the job but before creating the task. - /// - MissingBatchTask, - - /// - /// Node failed during startup or task execution (for example, ContainerInvalidImage, DiskFull) - /// - NodeFailedDuringStartupOrExecution - } -} diff --git a/src/TesApi.Web/Extensions/SystemThreadingTaskExtensions.cs b/src/TesApi.Web/Extensions/SystemThreadingTaskExtensions.cs new file mode 100644 index 000000000..40ac79752 --- /dev/null +++ b/src/TesApi.Web/Extensions/SystemThreadingTaskExtensions.cs @@ -0,0 +1,82 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Threading; +using System.Threading.Channels; +using System.Threading.Tasks; + +namespace TesApi.Web.Extensions +{ + /// + /// Extension methods for and associated types + /// + public static class SystemThreadingTaskExtensions + { + /// + /// Streams items as their associated tasks complete. + /// + /// Type of items in . + /// Items to be streamed in the order of completion of their associated s. + /// Required parameter. Must not be . + /// Required if is not derived from . Obtains the associated with each element in . + /// in the order of the completion of their associated s. + /// + /// + /// + /// A task is sent to the return enumeration when it is "complete", which is when it either completes successfully, fails (queues an exception), or is cancelled.
+ /// No items in should share an identical instance. + ///
+ public static async IAsyncEnumerable WhenEach(this IEnumerable source, [EnumeratorCancellation] CancellationToken cancellationToken, Func sourceToTask = default) + { + ArgumentNullException.ThrowIfNull(source); + ArgumentNullException.ThrowIfNull(cancellationToken); + + // Ensure we have a usable sourceToTask + sourceToTask ??= typeof(T).IsAssignableTo(typeof(Task)) ? new(i => (i as Task)!) : throw new ArgumentNullException(nameof(sourceToTask)); + + var list = source.Where(e => e is not null).Select(e => (Entry: e, Task: sourceToTask(e))).ToList(); + var pendingCount = list.Count; + + //if (list.Select(e => e.Task).ToHashSet().Count != pendingCount) // Check for duplicate tasks + //{ + // throw new ArgumentException("Duplicate System.Threading.Tasks found referenced in collection.", nameof(source)); + //} + + if (list.Count == 0) + { + yield break; + } + + // There should be no more ArgumentExceptions after this point. + var channel = Channel.CreateBounded(pendingCount); + + // Add continuations to every task. Those continuations will feed the foreach below + _ = Parallel.ForEach(list, tuple => + { + // The continuation task returned with ContinueWith() is attached to the associated task and will be disposed with it. + _ = tuple.Task.ContinueWith(task => + { + _ = channel.Writer.TryWrite(tuple.Entry); + + if (Interlocked.Decrement(ref pendingCount) == 0) + { + channel.Writer.Complete(); + } + }, + cancellationToken, + TaskContinuationOptions.DenyChildAttach, + TaskScheduler.Default); + }); + + // Return all completed entries as their tasks are completed, no matter if by failure, cancellation, or running to completion. 
+ await foreach (var entry in channel.Reader.ReadAllAsync(cancellationToken)) + { + yield return entry; + } + } + } +} diff --git a/src/TesApi.Web/IBatchScheduler.cs b/src/TesApi.Web/IBatchScheduler.cs index 2641d9190..604bbea4f 100644 --- a/src/TesApi.Web/IBatchScheduler.cs +++ b/src/TesApi.Web/IBatchScheduler.cs @@ -41,7 +41,7 @@ public interface IBatchScheduler /// s to schedule on the batch system. /// A for controlling the lifetime of the asynchronous operation. /// True for each corresponding that needs to be persisted. - IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessQueuedTesTasksAsync(TesTask[] tesTasks, CancellationToken cancellationToken); + IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessQueuedTesTasksAsync(TesTask[] tesTasks, CancellationToken cancellationToken); /// /// Finalize completed s on a batch system @@ -50,7 +50,7 @@ public interface IBatchScheduler /// s corresponding to each . /// A for controlling the lifetime of the asynchronous operation. /// True for each corresponding that needs to be persisted. - IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessCompletedTesTasksAsync(IEnumerable tesTasks, CloudTask[] cloudTasks, CancellationToken cancellationToken); + //IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessCompletedTesTasksAsync(IEnumerable tesTasks, CloudTask[] cloudTasks, CancellationToken cancellationToken); /// /// Terminate cancelled s on a batch system @@ -58,7 +58,7 @@ public interface IBatchScheduler /// s to schedule on the batch system. /// A for controlling the lifetime of the asynchronous operation. /// True for each corresponding that needs to be persisted. 
- IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessCancelledTesTasksAsync(IEnumerable tesTasks, CancellationToken cancellationToken); + IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessCancelledTesTasksAsync(IEnumerable tesTasks, CancellationToken cancellationToken); /// /// Delete terminated s on a batch system @@ -66,7 +66,16 @@ public interface IBatchScheduler /// s to schedule on the batch system. /// A for controlling the lifetime of the asynchronous operation. /// True for each corresponding that needs to be persisted. - IAsyncEnumerable<(TesTask TesTask, Task IsModified)> ProcessTerminatedTesTasksAsync(IEnumerable tesTasks, CancellationToken cancellationToken); + IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessTerminatedTesTasksAsync(IEnumerable tesTasks, CancellationToken cancellationToken); + + /// + /// Assign pool and compute node errors to s on a batch system + /// + /// s to schedule on the batch system. + /// s corresponding to each . + /// A for controlling the lifetime of the asynchronous operation. + /// True for each corresponding that needs to be persisted. + IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessTesTaskBatchStatesAsync(IEnumerable tesTasks, AzureBatchTaskState[] taskStates, CancellationToken cancellationToken); //TODO: Add other task update methods here. diff --git a/src/TesApi.Web/OrchestrateOnBatchSchedulerService.cs b/src/TesApi.Web/OrchestrateOnBatchSchedulerService.cs index 8cbb464af..3e16c77e5 100644 --- a/src/TesApi.Web/OrchestrateOnBatchSchedulerService.cs +++ b/src/TesApi.Web/OrchestrateOnBatchSchedulerService.cs @@ -135,7 +135,7 @@ protected async Task ExecuteActionOnIntervalAsync(TimeSpan runInterval, Func, performs an action in the batch system using , and updates the resultant state /// /// A System.Threading.Tasks.ValueTask that represents the long running operations. 
- protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func>> tesTaskGetter, Func IsModified)>> tesTaskProcessor, CancellationToken stoppingToken) + protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func>> tesTaskGetter, Func IsModifiedAsync)>> tesTaskProcessor, CancellationToken stoppingToken) { var pools = new HashSet(); var tesTasks = await (await tesTaskGetter(stoppingToken)).ToArrayAsync(stoppingToken); From 0c2026b2bbc348c0960512bbc3622c3ec48222f1 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 2 Oct 2023 18:10:36 -0700 Subject: [PATCH 013/202] Minor cleanup --- src/TesApi.Web/BatchScheduler.cs | 12 +++++------- .../Extensions/SystemThreadingTaskExtensions.cs | 8 ++++---- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 17aa03177..1d6a0f8f0 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -1043,17 +1043,15 @@ Task HandleException(Exception exception, string poolKey, TesTask tesTask) /// - public async IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessTesTaskBatchStatesAsync(IEnumerable tesTasks, AzureBatchTaskState[] taskStates, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) + public IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessTesTaskBatchStatesAsync(IEnumerable tesTasks, AzureBatchTaskState[] taskStates, CancellationToken cancellationToken) { ArgumentNullException.ThrowIfNull(tesTasks); ArgumentNullException.ThrowIfNull(taskStates); - await foreach (var result in taskStates.Zip(tesTasks) - .Select(entry => (entry.Second, HandleTesTaskTransitionAsync(entry.Second, entry.First, cancellationToken).AsTask())) - .WhenEach(cancellationToken, tuple => tuple.Item2)) - { - yield return result; - } + return taskStates.Zip(tesTasks, (TaskState, TesTask) => (TaskState, TesTask)) + .Where(entry => 
entry.TesTask.IsActiveState()) + .Select(entry => (entry.TesTask, HandleTesTaskTransitionAsync(entry.TesTask, entry.TaskState, cancellationToken).AsTask())) + .WhenEach(cancellationToken, tuple => tuple.Item2); } /// diff --git a/src/TesApi.Web/Extensions/SystemThreadingTaskExtensions.cs b/src/TesApi.Web/Extensions/SystemThreadingTaskExtensions.cs index 40ac79752..0722bc298 100644 --- a/src/TesApi.Web/Extensions/SystemThreadingTaskExtensions.cs +++ b/src/TesApi.Web/Extensions/SystemThreadingTaskExtensions.cs @@ -41,10 +41,10 @@ public static async IAsyncEnumerable WhenEach(this IEnumerable source, var list = source.Where(e => e is not null).Select(e => (Entry: e, Task: sourceToTask(e))).ToList(); var pendingCount = list.Count; - //if (list.Select(e => e.Task).ToHashSet().Count != pendingCount) // Check for duplicate tasks - //{ - // throw new ArgumentException("Duplicate System.Threading.Tasks found referenced in collection.", nameof(source)); - //} + if (list.Select(e => e.Task).ToHashSet().Count != pendingCount) // Check for duplicate tasks + { + throw new ArgumentException("Duplicate System.Threading.Tasks found referenced in collection.", nameof(source)); + } if (list.Count == 0) { From 446cc702f7a342bea6684631050da96e3f7d24f9 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Tue, 3 Oct 2023 18:05:51 -0700 Subject: [PATCH 014/202] Add systemic failure handling, some cleanup, fixed minor errors --- src/TesApi.Tests/BatchSchedulerTests.cs | 752 +++++++++--------- src/TesApi.Web/AzureBatchTaskState.cs | 5 + src/TesApi.Web/BatchPool.cs | 60 +- src/TesApi.Web/BatchPoolService.cs | 57 +- src/TesApi.Web/BatchScheduler.BatchPools.cs | 1 + src/TesApi.Web/BatchScheduler.cs | 251 ++---- .../SystemThreadingTaskExtensions.cs | 13 +- src/TesApi.Web/IBatchPool.cs | 16 +- src/TesApi.Web/IBatchScheduler.cs | 25 - .../OrchestrateOnBatchSchedulerService.cs | 4 +- src/TesApi.Web/Scheduler.cs | 30 +- 11 files changed, 579 insertions(+), 635 deletions(-) diff --git 
a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index 8a2874a25..6a62b04f0 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -63,7 +63,7 @@ public async Task GetOrAddDoesNotAddExistingAvailablePool() serviceProvider.AzureProxy.Verify(mock => mock.CreateBatchPoolAsync(It.IsAny(), It.IsAny(), It.IsAny()), Times.Once); var pool = await batchScheduler.GetOrAddPoolAsync(key, false, (id, cancellationToken) => ValueTask.FromResult(new Pool(name: id)), System.Threading.CancellationToken.None); - await pool.ServicePoolAsync(); + await foreach (var _ in pool.ServicePoolAsync()) { } Assert.AreEqual(batchScheduler.GetPools().Count(), count); Assert.AreEqual(batchScheduler.GetPoolGroupKeys().Count(), keyCount); @@ -85,7 +85,7 @@ public async Task GetOrAddDoesAddWithExistingUnavailablePool() var count = batchScheduler.GetPools().Count(); var pool = await batchScheduler.GetOrAddPoolAsync(key, false, (id, cancellationToken) => ValueTask.FromResult(new Pool(name: id)), System.Threading.CancellationToken.None); - await pool.ServicePoolAsync(); + await foreach (var _ in pool.ServicePoolAsync()) { } Assert.AreNotEqual(batchScheduler.GetPools().Count(), count); Assert.AreEqual(batchScheduler.GetPoolGroupKeys().Count(), keyCount); @@ -267,21 +267,21 @@ public async Task TesTaskFailsWithSystemErrorWhenTotalBatchQuotaIsSetTooLow() Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(new TesResources { CpuCores = 2, RamGb = 1, Preemptible = false }, azureProxyReturnValues)); } - //[TestMethod] - //public async Task TesTaskFailsWhenBatchNodeDiskIsFull() - //{ - // var tesTask = GetTesTask(); + [TestMethod] + public async Task TesTaskFailsWhenBatchNodeDiskIsFull() + { + var tesTask = GetTesTask(); - // (var failureReason, var systemLog) = await ProcessTesTaskAndGetFailureReasonAndSystemLogAsync(tesTask, BatchJobAndTaskStates.NodeDiskFull); + (var failureReason, var systemLog) = await 
ProcessTesTaskAndGetFailureReasonAndSystemLogAsync(tesTask, BatchTaskStates.NodeDiskFull); - // GuardAssertsWithTesTask(tesTask, () => - // { - // Assert.AreEqual(TesState.EXECUTORERROREnum, tesTask.State); - // Assert.AreEqual("DiskFull", failureReason); - // Assert.AreEqual("DiskFull", systemLog[0]); - // Assert.AreEqual("DiskFull", tesTask.FailureReason); - // }); - //} + GuardAssertsWithTesTask(tesTask, () => + { + Assert.AreEqual(TesState.SYSTEMERROREnum, tesTask.State); // TODO: Should be ExecutorError, but this currently falls into the bucket of NodeFailedDuringStartupOrExecution, which also covers StartTask failures, which are more correctly SystemError. + Assert.AreEqual("DiskFull", failureReason); + Assert.AreEqual("DiskFull", systemLog[0]); + Assert.AreEqual("DiskFull", tesTask.FailureReason); + }); + } //TODO: This test (and potentially others) must be reviewed and see if they are necessary considering that the quota verification logic is its own class. // There are a couple of issues: a similar validation already exists in the quota verifier class, and in order to run this test a complex set up is required, which is hard to maintain. 
@@ -768,312 +768,312 @@ static async Task RunTest(string allowedVmSizes, TesState expectedTaskState, str await RunTest("VmFamily2", TesState.INITIALIZINGEnum, "VmSizeLowPri2"); } - //[TestMethod] - //public async Task TaskStateTransitionsFromRunningState() - //{ - // Assert.AreEqual(TesState.RUNNINGEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.TaskActive)); - // Assert.AreEqual(TesState.RUNNINGEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.TaskPreparing)); - // Assert.AreEqual(TesState.RUNNINGEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.TaskRunning)); - // Assert.AreEqual(TesState.COMPLETEEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.TaskCompletedSuccessfully)); - // Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.TaskFailed)); - // Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.JobNotFound)); - // Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.TaskNotFound)); - // Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.MoreThanOneJobFound)); - // Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.NodeDiskFull)); - // Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.ActiveJobWithMissingAutoPool)); - // Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchJobAndTaskStates.NodePreempted)); - //} - - //[TestMethod] - //public async Task TaskStateTransitionsFromInitializingState() - //{ - // Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, 
BatchJobAndTaskStates.TaskActive)); - // Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.TaskPreparing)); - // Assert.AreEqual(TesState.RUNNINGEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.TaskRunning)); - // Assert.AreEqual(TesState.COMPLETEEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.TaskCompletedSuccessfully)); - // Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.TaskFailed)); - // Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.JobNotFound)); - // Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.TaskNotFound)); - // Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.MoreThanOneJobFound)); - // Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.NodeDiskFull)); - // Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.NodeAllocationFailed)); - // Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.ImageDownloadFailed)); - // Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.ActiveJobWithMissingAutoPool)); - // Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchJobAndTaskStates.NodePreempted)); - //} - - //[TestMethod] - //public async Task TaskStateTransitionsFromQueuedState() - //{ - // Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, 
BatchJobAndTaskStates.TaskActive)); - // Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchJobAndTaskStates.TaskPreparing)); - // Assert.AreEqual(TesState.RUNNINGEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchJobAndTaskStates.TaskRunning)); - // Assert.AreEqual(TesState.COMPLETEEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchJobAndTaskStates.TaskCompletedSuccessfully)); - // Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchJobAndTaskStates.TaskFailed)); - // Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchJobAndTaskStates.MoreThanOneJobFound)); - // Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchJobAndTaskStates.NodeDiskFull)); - // Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchJobAndTaskStates.TaskNotFound)); - //} - - //[TestMethod] - //public async Task TaskIsRequeuedUpToThreeTimesForTransientErrors() - //{ - // var tesTask = GetTesTask(); - - // var azureProxyReturnValues = AzureProxyReturnValues.Defaults; - - // azureProxyReturnValues.VmSizesAndPrices = new() { - // new() { VmSize = "VmSize1", LowPriority = false, VCpusAvailable = 2, MemoryInGiB = 4, ResourceDiskSizeInGiB = 20, PricePerHour = 1 }, - // new() { VmSize = "VmSize2", LowPriority = false, VCpusAvailable = 2, MemoryInGiB = 4, ResourceDiskSizeInGiB = 20, PricePerHour = 2 }, - // new() { VmSize = "VmSize3", LowPriority = false, VCpusAvailable = 2, MemoryInGiB = 4, ResourceDiskSizeInGiB = 20, PricePerHour = 3 }, - // new() { VmSize = "VmSize4", LowPriority = false, VCpusAvailable = 2, MemoryInGiB = 4, ResourceDiskSizeInGiB = 20, PricePerHour = 4 }, - // new() { VmSize = "VmSize5", LowPriority = false, VCpusAvailable = 2, MemoryInGiB = 4, ResourceDiskSizeInGiB = 20, PricePerHour = 5 } - // }; - - // await 
GetNewTesTaskStateAsync(tesTask, azureProxyReturnValues); - // await GuardAssertsWithTesTask(tesTask, async () => Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(tesTask, BatchJobAndTaskStates.NodeAllocationFailed))); - // await GetNewTesTaskStateAsync(tesTask, azureProxyReturnValues); - // await GuardAssertsWithTesTask(tesTask, async () => Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(tesTask, BatchJobAndTaskStates.NodeAllocationFailed))); - // await GetNewTesTaskStateAsync(tesTask, azureProxyReturnValues); - // await GuardAssertsWithTesTask(tesTask, async () => Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(tesTask, BatchJobAndTaskStates.NodeAllocationFailed))); - // await GetNewTesTaskStateAsync(tesTask, azureProxyReturnValues); - // await GuardAssertsWithTesTask(tesTask, async () => Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(tesTask, BatchJobAndTaskStates.NodeAllocationFailed))); - //} - - //[TestMethod] - //public async Task TaskThatFailsWithNodeAllocationErrorIsRequeuedOnDifferentVmSize() - //{ - // var tesTask = GetTesTask(); - - // await GetNewTesTaskStateAsync(tesTask); - // await GetNewTesTaskStateAsync(tesTask, BatchJobAndTaskStates.NodeAllocationFailed); - // var firstAttemptVmSize = tesTask.Logs[0].VirtualMachineInfo.VmSize; - - // await GetNewTesTaskStateAsync(tesTask); - // await GetNewTesTaskStateAsync(tesTask, BatchJobAndTaskStates.NodeAllocationFailed); - // var secondAttemptVmSize = tesTask.Logs[1].VirtualMachineInfo.VmSize; - - // GuardAssertsWithTesTask(tesTask, () => Assert.AreNotEqual(firstAttemptVmSize, secondAttemptVmSize)); - - // // There are only two suitable VMs, and both have been excluded because of the NodeAllocationFailed error on the two earlier attempts - // _ = await GetNewTesTaskStateAsync(tesTask); - - // GuardAssertsWithTesTask(tesTask, () => - // { - // Assert.AreEqual(TesState.SYSTEMERROREnum, tesTask.State); - // 
Assert.AreEqual("NoVmSizeAvailable", tesTask.FailureReason); - // }); - //} - - //[TestMethod] - //public async Task TaskGetsCancelled() - //{ - // var tesTask = new TesTask { Id = "test", PoolId = "pool1", State = TesState.CANCELINGEnum }; - - // var azureProxyReturnValues = AzureProxyReturnValues.Defaults; - // azureProxyReturnValues.BatchJobAndTaskState = BatchJobAndTaskStates.TaskActive; - // Mock azureProxy = default; - // var azureProxySetter = new Action>(mock => - // { - // GetMockAzureProxy(azureProxyReturnValues)(mock); - // azureProxy = mock; - // }); - - // _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxySetter, azureProxyReturnValues); - - // GuardAssertsWithTesTask(tesTask, () => - // { - // Assert.AreEqual(TesState.CANCELEDEnum, tesTask.State); - // Assert.IsTrue(tesTask.IsTaskDeletionRequired); - // azureProxy.Verify(i => i.TerminateBatchTaskAsync(tesTask.Id, It.IsAny(), It.IsAny())); - // }); - //} - - //[TestMethod] - //public async Task CancelledTaskGetsDeleted() - //{ - // var tesTask = new TesTask { Id = "test", PoolId = "pool1", State = TesState.CANCELEDEnum, IsTaskDeletionRequired = true, Logs = new() { new() { StartTime = DateTimeOffset.UtcNow - TimeSpan.FromMinutes(11) } } }; - - // var azureProxyReturnValues = AzureProxyReturnValues.Defaults; - // azureProxyReturnValues.BatchJobAndTaskState = BatchJobAndTaskStates.TaskActive; - // Mock azureProxy = default; - // var azureProxySetter = new Action>(mock => - // { - // GetMockAzureProxy(azureProxyReturnValues)(mock); - // azureProxy = mock; - // }); - - // _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxySetter, azureProxyReturnValues); - - // GuardAssertsWithTesTask(tesTask, () => - // { - // Assert.IsFalse(tesTask.IsTaskDeletionRequired); - // azureProxy.Verify(i => i.DeleteBatchTaskAsync(tesTask.Id, It.IsAny(), It.IsAny())); - // }); - //} - - //[TestMethod] - //public async Task 
SuccessfullyCompletedTaskContainsBatchNodeMetrics() - //{ - // var tesTask = GetTesTask(); - - // var metricsFileContent = @" - // BlobXferPullStart=2020-10-08T02:30:39+00:00 - // BlobXferPullEnd=2020-10-08T02:31:39+00:00 - // ExecutorPullStart=2020-10-08T02:32:39+00:00 - // ExecutorImageSizeInBytes=3000000000 - // ExecutorPullEnd=2020-10-08T02:34:39+00:00 - // DownloadStart=2020-10-08T02:35:39+00:00 - // FileDownloadSizeInBytes=2000000000 - // DownloadEnd=2020-10-08T02:38:39+00:00 - // ExecutorStart=2020-10-08T02:39:39+00:00 - // ExecutorEnd=2020-10-08T02:43:39+00:00 - // UploadStart=2020-10-08T02:44:39+00:00 - // FileUploadSizeInBytes=4000000000 - // UploadEnd=2020-10-08T02:49:39+00:00 - // DiskSizeInKiB=8000000 - // DiskUsedInKiB=1000000".Replace(" ", string.Empty); - - // var azureProxyReturnValues = AzureProxyReturnValues.Defaults; - // azureProxyReturnValues.BatchJobAndTaskState = BatchJobAndTaskStates.TaskCompletedSuccessfully; - // azureProxyReturnValues.DownloadedBlobContent = metricsFileContent; - - // _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(azureProxyReturnValues), azureProxyReturnValues); - - // GuardAssertsWithTesTask(tesTask, () => - // { - // Assert.AreEqual(TesState.COMPLETEEnum, tesTask.State); - - // var batchNodeMetrics = tesTask.GetOrAddTesTaskLog().BatchNodeMetrics; - // Assert.IsNotNull(batchNodeMetrics); - // Assert.AreEqual(60, batchNodeMetrics.BlobXferImagePullDurationInSeconds); - // Assert.AreEqual(120, batchNodeMetrics.ExecutorImagePullDurationInSeconds); - // Assert.AreEqual(3, batchNodeMetrics.ExecutorImageSizeInGB); - // Assert.AreEqual(180, batchNodeMetrics.FileDownloadDurationInSeconds); - // Assert.AreEqual(240, batchNodeMetrics.ExecutorDurationInSeconds); - // Assert.AreEqual(300, batchNodeMetrics.FileUploadDurationInSeconds); - // Assert.AreEqual(1.024, batchNodeMetrics.DiskUsedInGB); - // Assert.AreEqual(12.5f, batchNodeMetrics.DiskUsedPercent); - // Assert.AreEqual(2, 
batchNodeMetrics.FileDownloadSizeInGB); - // Assert.AreEqual(4, batchNodeMetrics.FileUploadSizeInGB); - - // var executorLog = tesTask.GetOrAddTesTaskLog().GetOrAddExecutorLog(); - // Assert.IsNotNull(executorLog); - // Assert.AreEqual(0, executorLog.ExitCode); - // Assert.AreEqual(DateTimeOffset.Parse("2020-10-08T02:30:39+00:00"), executorLog.StartTime); - // Assert.AreEqual(DateTimeOffset.Parse("2020-10-08T02:49:39+00:00"), executorLog.EndTime); - // }); - //} - - //[TestMethod] - //public async Task SuccessfullyCompletedTaskContainsCromwellResultCode() - //{ - // var tesTask = GetTesTask(); - - // var azureProxyReturnValues = AzureProxyReturnValues.Defaults; - // azureProxyReturnValues.BatchJobAndTaskState = BatchJobAndTaskStates.TaskCompletedSuccessfully; - // azureProxyReturnValues.DownloadedBlobContent = "2"; - // var azureProxy = GetMockAzureProxy(azureProxyReturnValues); - - // _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxy, azureProxyReturnValues); - - // GuardAssertsWithTesTask(tesTask, () => - // { - // Assert.AreEqual(TesState.COMPLETEEnum, tesTask.State); - // Assert.AreEqual(2, tesTask.GetOrAddTesTaskLog().CromwellResultCode); - // Assert.AreEqual(2, tesTask.CromwellResultCode); - // }); - //} - - //[TestMethod] - //public async Task TesInputFilePathMustStartWithCromwellExecutions() - //{ - // var tesTask = GetTesTask(); - - // tesTask.Inputs.Add(new() - // { - // Path = "xyz/path" - // }); - - // (var failureReason, var systemLog) = await ProcessTesTaskAndGetFailureReasonAndSystemLogAsync(tesTask); - - // GuardAssertsWithTesTask(tesTask, () => - // { - // Assert.AreEqual(TesState.SYSTEMERROREnum, tesTask.State); - // Assert.AreEqual($"InvalidInputFilePath", failureReason); - // Assert.AreEqual($"InvalidInputFilePath", systemLog[0]); - // Assert.AreEqual($"Unsupported input path 'xyz/path' for task Id {tesTask.Id}. 
Must start with '/'.", systemLog[1]); - // }); - //} - - //[TestMethod] - //public async Task TesInputFileMustHaveEitherUrlOrContent() - //{ - // var tesTask = GetTesTask(); - - // tesTask.Inputs.Add(new() - // { - // Url = null, - // Content = null, - // Path = "/file1.txt" - // }); - - // (var failureReason, var systemLog) = await ProcessTesTaskAndGetFailureReasonAndSystemLogAsync(tesTask); - - // GuardAssertsWithTesTask(tesTask, () => - // { - // Assert.AreEqual(TesState.SYSTEMERROREnum, tesTask.State); - // Assert.AreEqual($"InvalidInputFilePath", failureReason); - // Assert.AreEqual($"InvalidInputFilePath", systemLog[0]); - // Assert.AreEqual($"One of Input Url or Content must be set", systemLog[1]); - // }); - //} - - //[TestMethod] - //public async Task TesInputFileMustNotHaveBothUrlAndContent() - //{ - // var tesTask = GetTesTask(); - - // tesTask.Inputs.Add(new() - // { - // Url = "/storageaccount1/container1/file1.txt", - // Content = "test content", - // Path = "/file1.txt" - // }); - - // (var failureReason, var systemLog) = await ProcessTesTaskAndGetFailureReasonAndSystemLogAsync(tesTask); - - // GuardAssertsWithTesTask(tesTask, () => - // { - // Assert.AreEqual(TesState.SYSTEMERROREnum, tesTask.State); - // Assert.AreEqual($"InvalidInputFilePath", failureReason); - // Assert.AreEqual($"InvalidInputFilePath", systemLog[0]); - // Assert.AreEqual($"Input Url and Content cannot be both set", systemLog[1]); - // }); - //} - - //[TestMethod] - //public async Task TesInputFileTypeMustNotBeDirectory() - //{ - // var tesTask = GetTesTask(); - - // tesTask.Inputs.Add(new() - // { - // Url = "/storageaccount1/container1/directory", - // Type = TesFileType.DIRECTORYEnum, - // Path = "/directory" - // }); - - // (var failureReason, var systemLog) = await ProcessTesTaskAndGetFailureReasonAndSystemLogAsync(tesTask); - - // GuardAssertsWithTesTask(tesTask, () => - // { - // Assert.AreEqual(TesState.SYSTEMERROREnum, tesTask.State); - // 
Assert.AreEqual($"InvalidInputFilePath", failureReason); - // Assert.AreEqual($"InvalidInputFilePath", systemLog[0]); - // Assert.AreEqual($"Directory input is not supported.", systemLog[1]); - // }); - //} + [TestMethod] + public async Task TaskStateTransitionsFromRunningState() + { + //Assert.AreEqual(TesState.RUNNINGEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchTaskStates.TaskActive)); + Assert.AreEqual(TesState.RUNNINGEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchTaskStates.TaskPreparing)); + Assert.AreEqual(TesState.RUNNINGEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchTaskStates.TaskRunning)); + Assert.AreEqual(TesState.COMPLETEEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchTaskStates.TaskCompletedSuccessfully)); + Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchTaskStates.TaskFailed)); + //Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchTaskStates.JobNotFound)); + //Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchTaskStates.TaskNotFound)); + //Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchTaskStates.MoreThanOneJobFound)); + Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchTaskStates.NodeDiskFull)); // TODO: Should be ExecutorError, but this currently falls into the bucket of NodeFailedDuringStartupOrExecution, which also covers StartTask failures, which are more correctly SystemError. 
+ //Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchTaskStates.ActiveJobWithMissingAutoPool)); + Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchTaskStates.NodePreempted)); + } + + [TestMethod] + public async Task TaskStateTransitionsFromInitializingState() + { + //Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.TaskActive)); + Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.TaskPreparing)); + Assert.AreEqual(TesState.RUNNINGEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.TaskRunning)); + Assert.AreEqual(TesState.COMPLETEEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.TaskCompletedSuccessfully)); + Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.TaskFailed)); + //Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.JobNotFound)); + //Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.TaskNotFound)); + //Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.MoreThanOneJobFound)); + Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.NodeDiskFull)); // TODO: Should be ExecutorError, but this currently falls into the bucket of NodeFailedDuringStartupOrExecution, which also covers StartTask failures, which are more correctly SystemError. 
+ Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.NodeAllocationFailed)); + //Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.ImageDownloadFailed)); + //Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.ActiveJobWithMissingAutoPool)); + Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.NodePreempted)); + } + + [TestMethod] + public async Task TaskStateTransitionsFromQueuedState() + { + Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchTaskStates.TaskActive)); + //Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchTaskStates.TaskPreparing)); + Assert.AreEqual(TesState.RUNNINGEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchTaskStates.TaskRunning)); + Assert.AreEqual(TesState.COMPLETEEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchTaskStates.TaskCompletedSuccessfully)); + Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchTaskStates.TaskFailed)); + //Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchTaskStates.MoreThanOneJobFound)); + Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchTaskStates.NodeDiskFull)); // TODO: Should be ExecutorError, but this currently falls into the bucket of NodeFailedDuringStartupOrExecution, which also covers StartTask failures, which are more correctly SystemError. 
+ //Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchTaskStates.TaskNotFound)); + } + + [TestMethod] + public async Task TaskIsRequeuedUpToThreeTimesForTransientErrors() + { + var tesTask = GetTesTask(); + + var azureProxyReturnValues = AzureProxyReturnValues.Defaults; + + azureProxyReturnValues.VmSizesAndPrices = new() { + new() { VmSize = "VmSize1", LowPriority = false, VCpusAvailable = 2, MemoryInGiB = 4, ResourceDiskSizeInGiB = 20, PricePerHour = 1 }, + new() { VmSize = "VmSize2", LowPriority = false, VCpusAvailable = 2, MemoryInGiB = 4, ResourceDiskSizeInGiB = 20, PricePerHour = 2 }, + new() { VmSize = "VmSize3", LowPriority = false, VCpusAvailable = 2, MemoryInGiB = 4, ResourceDiskSizeInGiB = 20, PricePerHour = 3 }, + new() { VmSize = "VmSize4", LowPriority = false, VCpusAvailable = 2, MemoryInGiB = 4, ResourceDiskSizeInGiB = 20, PricePerHour = 4 }, + new() { VmSize = "VmSize5", LowPriority = false, VCpusAvailable = 2, MemoryInGiB = 4, ResourceDiskSizeInGiB = 20, PricePerHour = 5 } + }; + + await GetNewTesTaskStateAsync(tesTask, azureProxyReturnValues); + await GuardAssertsWithTesTask(tesTask, async () => Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(tesTask, BatchTaskStates.NodeAllocationFailed))); + await GetNewTesTaskStateAsync(tesTask, azureProxyReturnValues); + await GuardAssertsWithTesTask(tesTask, async () => Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(tesTask, BatchTaskStates.NodeAllocationFailed))); + await GetNewTesTaskStateAsync(tesTask, azureProxyReturnValues); + await GuardAssertsWithTesTask(tesTask, async () => Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(tesTask, BatchTaskStates.NodeAllocationFailed))); + await GetNewTesTaskStateAsync(tesTask, azureProxyReturnValues); + await GuardAssertsWithTesTask(tesTask, async () => Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(tesTask, 
BatchTaskStates.NodeAllocationFailed))); + } + + [TestMethod] + public async Task TaskThatFailsWithNodeAllocationErrorIsRequeuedOnDifferentVmSize() + { + var tesTask = GetTesTask(); + + await GetNewTesTaskStateAsync(tesTask); + await GetNewTesTaskStateAsync(tesTask, BatchTaskStates.NodeAllocationFailed); + var firstAttemptVmSize = tesTask.Logs[0].VirtualMachineInfo.VmSize; + + await GetNewTesTaskStateAsync(tesTask); + await GetNewTesTaskStateAsync(tesTask, BatchTaskStates.NodeAllocationFailed); + var secondAttemptVmSize = tesTask.Logs[1].VirtualMachineInfo.VmSize; + + GuardAssertsWithTesTask(tesTask, () => Assert.AreNotEqual(firstAttemptVmSize, secondAttemptVmSize)); + + // There are only two suitable VMs, and both have been excluded because of the NodeAllocationFailed error on the two earlier attempts + _ = await GetNewTesTaskStateAsync(tesTask); + + GuardAssertsWithTesTask(tesTask, () => + { + Assert.AreEqual(TesState.SYSTEMERROREnum, tesTask.State); + Assert.AreEqual("NoVmSizeAvailable", tesTask.FailureReason); + }); + } + + [TestMethod] + public async Task TaskGetsCancelled() + { + var tesTask = new TesTask { Id = "test", PoolId = "pool1", State = TesState.CANCELINGEnum }; + + var azureProxyReturnValues = AzureProxyReturnValues.Defaults; + azureProxyReturnValues.BatchTaskState = BatchTaskStates.TaskPreparing; + Mock azureProxy = default; + var azureProxySetter = new Action>(mock => + { + GetMockAzureProxy(azureProxyReturnValues)(mock); + azureProxy = mock; + }); + + _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxySetter, azureProxyReturnValues); + + GuardAssertsWithTesTask(tesTask, () => + { + Assert.AreEqual(TesState.CANCELEDEnum, tesTask.State); + Assert.IsTrue(tesTask.IsTaskDeletionRequired); + azureProxy.Verify(i => i.TerminateBatchTaskAsync(tesTask.Id, It.IsAny(), It.IsAny())); + }); + } + + [TestMethod] + public async Task CancelledTaskGetsDeleted() + { + var tesTask = new TesTask { Id = "test", PoolId = "pool1", 
State = TesState.CANCELEDEnum, IsTaskDeletionRequired = true, Logs = new() { new() { StartTime = DateTimeOffset.UtcNow - TimeSpan.FromMinutes(11) } } }; + + var azureProxyReturnValues = AzureProxyReturnValues.Defaults; + azureProxyReturnValues.BatchTaskState = BatchTaskStates.Terminated; + Mock azureProxy = default; + var azureProxySetter = new Action>(mock => + { + GetMockAzureProxy(azureProxyReturnValues)(mock); + azureProxy = mock; + }); + + _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxySetter, azureProxyReturnValues); + + GuardAssertsWithTesTask(tesTask, () => + { + Assert.IsFalse(tesTask.IsTaskDeletionRequired); + azureProxy.Verify(i => i.DeleteBatchTaskAsync(tesTask.Id, It.IsAny(), It.IsAny())); + }); + } + + [TestMethod] + public async Task SuccessfullyCompletedTaskContainsBatchNodeMetrics() + { + var tesTask = GetTesTask(); + + var metricsFileContent = @" + BlobXferPullStart=2020-10-08T02:30:39+00:00 + BlobXferPullEnd=2020-10-08T02:31:39+00:00 + ExecutorPullStart=2020-10-08T02:32:39+00:00 + ExecutorImageSizeInBytes=3000000000 + ExecutorPullEnd=2020-10-08T02:34:39+00:00 + DownloadStart=2020-10-08T02:35:39+00:00 + FileDownloadSizeInBytes=2000000000 + DownloadEnd=2020-10-08T02:38:39+00:00 + ExecutorStart=2020-10-08T02:39:39+00:00 + ExecutorEnd=2020-10-08T02:43:39+00:00 + UploadStart=2020-10-08T02:44:39+00:00 + FileUploadSizeInBytes=4000000000 + UploadEnd=2020-10-08T02:49:39+00:00 + DiskSizeInKiB=8000000 + DiskUsedInKiB=1000000".Replace(" ", string.Empty); + + var azureProxyReturnValues = AzureProxyReturnValues.Defaults; + azureProxyReturnValues.BatchTaskState = BatchTaskStates.TaskCompletedSuccessfully; + azureProxyReturnValues.DownloadedBlobContent = metricsFileContent; + + _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(azureProxyReturnValues), azureProxyReturnValues); + + GuardAssertsWithTesTask(tesTask, () => + { + Assert.AreEqual(TesState.COMPLETEEnum, 
tesTask.State); + + var batchNodeMetrics = tesTask.GetOrAddTesTaskLog().BatchNodeMetrics; + Assert.IsNotNull(batchNodeMetrics); + Assert.AreEqual(60, batchNodeMetrics.BlobXferImagePullDurationInSeconds); + Assert.AreEqual(120, batchNodeMetrics.ExecutorImagePullDurationInSeconds); + Assert.AreEqual(3, batchNodeMetrics.ExecutorImageSizeInGB); + Assert.AreEqual(180, batchNodeMetrics.FileDownloadDurationInSeconds); + Assert.AreEqual(240, batchNodeMetrics.ExecutorDurationInSeconds); + Assert.AreEqual(300, batchNodeMetrics.FileUploadDurationInSeconds); + Assert.AreEqual(1.024, batchNodeMetrics.DiskUsedInGB); + Assert.AreEqual(12.5f, batchNodeMetrics.DiskUsedPercent); + Assert.AreEqual(2, batchNodeMetrics.FileDownloadSizeInGB); + Assert.AreEqual(4, batchNodeMetrics.FileUploadSizeInGB); + + var executorLog = tesTask.GetOrAddTesTaskLog().GetOrAddExecutorLog(); + Assert.IsNotNull(executorLog); + Assert.AreEqual(0, executorLog.ExitCode); + Assert.AreEqual(DateTimeOffset.Parse("2020-10-08T02:30:39+00:00"), executorLog.StartTime); + Assert.AreEqual(DateTimeOffset.Parse("2020-10-08T02:49:39+00:00"), executorLog.EndTime); + }); + } + + [TestMethod] + public async Task SuccessfullyCompletedTaskContainsCromwellResultCode() + { + var tesTask = GetTesTask(); + + var azureProxyReturnValues = AzureProxyReturnValues.Defaults; + azureProxyReturnValues.BatchTaskState = BatchTaskStates.TaskCompletedSuccessfully; + azureProxyReturnValues.DownloadedBlobContent = "2"; + var azureProxy = GetMockAzureProxy(azureProxyReturnValues); + + _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxy, azureProxyReturnValues); + + GuardAssertsWithTesTask(tesTask, () => + { + Assert.AreEqual(TesState.COMPLETEEnum, tesTask.State); + Assert.AreEqual(2, tesTask.GetOrAddTesTaskLog().CromwellResultCode); + Assert.AreEqual(2, tesTask.CromwellResultCode); + }); + } + + [TestMethod] + public async Task TesInputFilePathMustStartWithCromwellExecutions() + { + var tesTask = 
GetTesTask(); + + tesTask.Inputs.Add(new() + { + Path = "xyz/path" + }); + + (var failureReason, var systemLog) = await ProcessTesTaskAndGetFailureReasonAndSystemLogAsync(tesTask); + + GuardAssertsWithTesTask(tesTask, () => + { + Assert.AreEqual(TesState.SYSTEMERROREnum, tesTask.State); + Assert.AreEqual($"InvalidInputFilePath", failureReason); + Assert.AreEqual($"InvalidInputFilePath", systemLog[0]); + Assert.AreEqual($"Unsupported input path 'xyz/path' for task Id {tesTask.Id}. Must start with '/'.", systemLog[1]); + }); + } + + [TestMethod] + public async Task TesInputFileMustHaveEitherUrlOrContent() + { + var tesTask = GetTesTask(); + + tesTask.Inputs.Add(new() + { + Url = null, + Content = null, + Path = "/file1.txt" + }); + + (var failureReason, var systemLog) = await ProcessTesTaskAndGetFailureReasonAndSystemLogAsync(tesTask); + + GuardAssertsWithTesTask(tesTask, () => + { + Assert.AreEqual(TesState.SYSTEMERROREnum, tesTask.State); + Assert.AreEqual($"InvalidInputFilePath", failureReason); + Assert.AreEqual($"InvalidInputFilePath", systemLog[0]); + Assert.AreEqual($"One of Input Url or Content must be set", systemLog[1]); + }); + } + + [TestMethod] + public async Task TesInputFileMustNotHaveBothUrlAndContent() + { + var tesTask = GetTesTask(); + + tesTask.Inputs.Add(new() + { + Url = "/storageaccount1/container1/file1.txt", + Content = "test content", + Path = "/file1.txt" + }); + + (var failureReason, var systemLog) = await ProcessTesTaskAndGetFailureReasonAndSystemLogAsync(tesTask); + + GuardAssertsWithTesTask(tesTask, () => + { + Assert.AreEqual(TesState.SYSTEMERROREnum, tesTask.State); + Assert.AreEqual($"InvalidInputFilePath", failureReason); + Assert.AreEqual($"InvalidInputFilePath", systemLog[0]); + Assert.AreEqual($"Input Url and Content cannot be both set", systemLog[1]); + }); + } + + [TestMethod] + public async Task TesInputFileTypeMustNotBeDirectory() + { + var tesTask = GetTesTask(); + + tesTask.Inputs.Add(new() + { + Url = 
"/storageaccount1/container1/directory", + Type = TesFileType.DIRECTORYEnum, + Path = "/directory" + }); + + (var failureReason, var systemLog) = await ProcessTesTaskAndGetFailureReasonAndSystemLogAsync(tesTask); + + GuardAssertsWithTesTask(tesTask, () => + { + Assert.AreEqual(TesState.SYSTEMERROREnum, tesTask.State); + Assert.AreEqual($"InvalidInputFilePath", failureReason); + Assert.AreEqual($"InvalidInputFilePath", systemLog[0]); + Assert.AreEqual($"Directory input is not supported.", systemLog[1]); + }); + } [TestMethod] public async Task QueryStringsAreRemovedFromLocalFilePathsWhenCommandScriptIsProvidedAsFile() @@ -1146,7 +1146,6 @@ public async Task QueryStringsAreRemovedFromLocalFilePathsWhenCommandScriptIsPro }); } - // TODO: Is this test still valid? [TestMethod] public async Task PublicHttpUrlsAreKeptIntact() { @@ -1183,7 +1182,6 @@ public async Task PublicHttpUrlsAreKeptIntact() }); } - // TODO: Is this test still valid? [TestMethod] public async Task PrivatePathsAndUrlsGetSasToken() { @@ -1506,15 +1504,15 @@ public async Task Assert.AreEqual(expectedPath, destination.Container.Path); } - //private static async Task<(string FailureReason, string[] SystemLog)> ProcessTesTaskAndGetFailureReasonAndSystemLogAsync(TesTask tesTask, AzureBatchJobAndTaskState? azureBatchJobAndTaskState = null) - //{ - // var azureProxyReturnValues = AzureProxyReturnValues.Defaults; - // azureProxyReturnValues.BatchJobAndTaskState = azureBatchJobAndTaskState ?? azureProxyReturnValues.BatchJobAndTaskState; + private static async Task<(string FailureReason, string[] SystemLog)> ProcessTesTaskAndGetFailureReasonAndSystemLogAsync(TesTask tesTask, AzureBatchTaskState azureBatchTaskState = null) + { + var azureProxyReturnValues = AzureProxyReturnValues.Defaults; + azureProxyReturnValues.BatchTaskState = azureBatchTaskState ?? 
azureProxyReturnValues.BatchTaskState; - // _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(azureProxyReturnValues), azureProxyReturnValues); + _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(azureProxyReturnValues), azureProxyReturnValues); - // return (tesTask.Logs?.LastOrDefault()?.FailureReason, tesTask.Logs?.LastOrDefault()?.SystemLogs?.ToArray()); - //} + return (tesTask.Logs?.LastOrDefault()?.FailureReason, tesTask.Logs?.LastOrDefault()?.SystemLogs?.ToArray()); + } private static Task<(string JobId, CloudTask CloudTask, PoolInformation PoolInformation, Pool batchModelsPool)> ProcessTesTaskAndGetBatchJobArgumentsAsync() => ProcessTesTaskAndGetBatchJobArgumentsAsync(GetTesTask(), GetMockConfig()(), GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); @@ -1532,29 +1530,14 @@ public async Task var batchScheduler = serviceProvider.GetT(); serviceProviderActions?.Invoke(serviceProvider); - //BatchJobAndTaskStates.TaskActive - //BatchJobAndTaskStates.TaskPreparing - //BatchJobAndTaskStates.TaskRunning - //BatchJobAndTaskStates.TaskCompletedSuccessfully - //BatchJobAndTaskStates.TaskFailed - //BatchJobAndTaskStates.JobNotFound - //BatchJobAndTaskStates.TaskNotFound - //BatchJobAndTaskStates.MoreThanOneJobFound - //BatchJobAndTaskStates.NodeAllocationFailed - //BatchJobAndTaskStates.NodePreempted - //BatchJobAndTaskStates.NodeDiskFull - //BatchJobAndTaskStates.ActiveJobWithMissingAutoPool - //BatchJobAndTaskStates.ImageDownloadFailed - - await foreach (var _ in tesTask.State switch - { - TesState.CANCELINGEnum => batchScheduler.ProcessCancelledTesTasksAsync(new[] { tesTask }, System.Threading.CancellationToken.None), - TesState.QUEUEDEnum => batchScheduler.ProcessQueuedTesTasksAsync(new[] { tesTask }, System.Threading.CancellationToken.None), - TesState.COMPLETEEnum => batchScheduler.ProcessCompletedTesTasksAsync(new[] { tesTask }, 
new[] { BatchPoolTests.GenerateTask(tesTask.PoolId, tesTask.Id, executionInfo: new(0, 0, startTime: DateTime.UtcNow - TimeSpan.FromMinutes(15), endTime: DateTime.UtcNow - TimeSpan.FromMinutes(5), exitCode: 0)) }, System.Threading.CancellationToken.None), - TesState.EXECUTORERROREnum => batchScheduler.ProcessCompletedTesTasksAsync(new[] { tesTask }, new[] { new CloudTask(tesTask.Id, "") }, System.Threading.CancellationToken.None), - TesState.SYSTEMERROREnum => batchScheduler.ProcessCompletedTesTasksAsync(new[] { tesTask }, new[] { new CloudTask(tesTask.Id, "") }, System.Threading.CancellationToken.None), - _ => batchScheduler.ProcessTerminatedTesTasksAsync(new[] { tesTask }, System.Threading.CancellationToken.None), - }) { } + if (azureProxyReturnValues.BatchTaskState is null) + { + await foreach (var _ in batchScheduler.ProcessQueuedTesTasksAsync(new[] { tesTask }, System.Threading.CancellationToken.None)) { } + } + else + { + await foreach (var _ in batchScheduler.ProcessTesTaskBatchStatesAsync(new[] { tesTask }, new[] { azureProxyReturnValues.BatchTaskState }, System.Threading.CancellationToken.None)) { } + } var createBatchPoolAsyncInvocation = serviceProvider.AzureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IAzureProxy.CreateBatchPoolAsync)); var addBatchTaskAsyncInvocation = serviceProvider.AzureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IAzureProxy.AddBatchTaskAsync)); @@ -1633,13 +1616,13 @@ private static async Task GetNewTesTaskStateAsync(TesTask tesTask, Azu return tesTask.State; } - private static Task GetNewTesTaskStateAsync(TesState currentTesTaskState/*, AzureBatchJobAndTaskState azureBatchJobAndTaskState*/) - => GetNewTesTaskStateAsync(new TesTask { Id = "test", State = currentTesTaskState, Executors = new() { new() { Image = "imageName1", Command = new() { "command" } } } }/*, azureBatchJobAndTaskState*/); + private static Task GetNewTesTaskStateAsync(TesState currentTesTaskState, AzureBatchTaskState 
batchTaskState = default) + => GetNewTesTaskStateAsync(new TesTask { Id = "test", State = currentTesTaskState, Executors = new() { new() { Image = "imageName1", Command = new() { "command" } } } }, batchTaskState); - private static Task GetNewTesTaskStateAsync(TesTask tesTask/*, AzureBatchJobAndTaskState? azureBatchJobAndTaskState = null*/) + private static Task GetNewTesTaskStateAsync(TesTask tesTask, AzureBatchTaskState batchTaskState = default) { var azureProxyReturnValues = AzureProxyReturnValues.Defaults; - //azureProxyReturnValues.BatchJobAndTaskState = azureBatchJobAndTaskState ?? azureProxyReturnValues.BatchJobAndTaskState; + azureProxyReturnValues.BatchTaskState = batchTaskState ?? azureProxyReturnValues.BatchTaskState; return GetNewTesTaskStateAsync(tesTask, azureProxyReturnValues); } @@ -1800,7 +1783,7 @@ internal static void GuardAssertsWithTesTask(TesTask tesTask, Action assertBlock } catch (AssertFailedException) { - foreach (var log in tesTask.Logs) + foreach (var log in tesTask.Logs ?? 
Enumerable.Empty()) { Console.WriteLine("Task failure: State: {0}: FailureReason: {1} SystemLogs: {2}", tesTask.State, log.FailureReason, string.Join(Environment.NewLine, log.SystemLogs)); } @@ -1829,22 +1812,39 @@ internal static async ValueTask GuardAssertsWithTesTask(TesTask tesTask, Func new() { JobState = JobState.Active, TaskState = TaskState.Active }; - // public static AzureBatchJobAndTaskState TaskPreparing => new() { JobState = JobState.Active, TaskState = TaskState.Preparing }; - // public static AzureBatchJobAndTaskState TaskRunning => new() { JobState = JobState.Active, TaskState = TaskState.Running }; - // public static AzureBatchJobAndTaskState TaskCompletedSuccessfully => new() { JobState = JobState.Completed, TaskState = TaskState.Completed, TaskExitCode = 0 }; - // public static AzureBatchJobAndTaskState TaskFailed => new() { JobState = JobState.Completed, TaskState = TaskState.Completed, TaskExitCode = -1 }; - // public static AzureBatchJobAndTaskState JobNotFound => new() { JobState = null }; - // public static AzureBatchJobAndTaskState TaskNotFound => new() { JobState = JobState.Active, TaskState = null }; - // public static AzureBatchJobAndTaskState MoreThanOneJobFound => new() { MoreThanOneActiveJobOrTaskFound = true }; - // public static AzureBatchJobAndTaskState NodeAllocationFailed => new() { JobState = JobState.Active, NodeAllocationFailed = true }; - // public static AzureBatchJobAndTaskState NodePreempted => new() { JobState = JobState.Active, NodeState = ComputeNodeState.Preempted }; - // public static AzureBatchJobAndTaskState NodeDiskFull => new() { JobState = JobState.Active, NodeErrorCode = "DiskFull" }; - // public static AzureBatchJobAndTaskState ActiveJobWithMissingAutoPool => new() { ActiveJobWithMissingAutoPool = true }; - // public static AzureBatchJobAndTaskState ImageDownloadFailed => new() { JobState = JobState.Active, NodeErrorCode = "ContainerInvalidImage" }; - //} + + //BatchJobAndTaskStates.TaskActive + 
//BatchJobAndTaskStates.TaskPreparing + //BatchJobAndTaskStates.TaskRunning + //BatchJobAndTaskStates.TaskCompletedSuccessfully + //BatchJobAndTaskStates.TaskFailed + //BatchJobAndTaskStates.JobNotFound + //BatchJobAndTaskStates.TaskNotFound + //BatchJobAndTaskStates.MoreThanOneJobFound + //BatchJobAndTaskStates.NodeAllocationFailed + //BatchJobAndTaskStates.NodePreempted + //BatchJobAndTaskStates.NodeDiskFull + //BatchJobAndTaskStates.ActiveJobWithMissingAutoPool + //BatchJobAndTaskStates.ImageDownloadFailed + + private struct BatchTaskStates + { + public static AzureBatchTaskState TaskActive => default; + public static AzureBatchTaskState TaskPreparing => new(AzureBatchTaskState.TaskState.Initializing, CloudTaskCreationTime: DateTimeOffset.UtcNow); + public static AzureBatchTaskState TaskRunning => new(AzureBatchTaskState.TaskState.Running, CloudTaskCreationTime: DateTimeOffset.UtcNow - TimeSpan.FromMinutes(6)); + public static AzureBatchTaskState TaskCompletedSuccessfully => new(AzureBatchTaskState.TaskState.CompletedSuccessfully, BatchTaskExitCode: 0); + public static AzureBatchTaskState TaskFailed => new(AzureBatchTaskState.TaskState.CompletedWithErrors, BatchTaskExitCode: -1); + //public static AzureBatchTaskState JobNotFound => new() { JobState = null }; + //public static AzureBatchTaskState TaskNotFound => new() { JobState = JobState.Active, TaskState = null }; + //public static AzureBatchTaskState MoreThanOneJobFound => new() { MoreThanOneActiveJobOrTaskFound = true }; + public static AzureBatchTaskState NodeAllocationFailed => new(AzureBatchTaskState.TaskState.NodeAllocationFailed, Failure: new(AzureBatchTaskState.TaskState.NodeAllocationFailed.ToString(), new[] { "Error message." })); + public static AzureBatchTaskState NodePreempted => new(AzureBatchTaskState.TaskState.NodePreempted); + public static AzureBatchTaskState NodeDiskFull => new(AzureBatchTaskState.TaskState.NodeFailedDuringStartupOrExecution, Failure: new("DiskFull", new[] { "Error message." 
})); + + public static AzureBatchTaskState Terminated => new(AzureBatchTaskState.TaskState.CompletedWithErrors, CloudTaskCreationTime: DateTimeOffset.UtcNow - TimeSpan.FromMinutes(12)); + //public static AzureBatchTaskState ActiveJobWithMissingAutoPool => new() { ActiveJobWithMissingAutoPool = true }; + //public static AzureBatchTaskState ImageDownloadFailed => new() { JobState = JobState.Active, NodeErrorCode = "ContainerInvalidImage" }; + } private class AzureProxyReturnValues { @@ -1859,7 +1859,7 @@ private class AzureProxyReturnValues public IEnumerable ActiveNodeCountByVmSize { get; set; } public int ActiveJobCount { get; set; } public int ActivePoolCount { get; set; } - //public AzureBatchJobAndTaskState BatchJobAndTaskState { get; set; } + public AzureBatchTaskState BatchTaskState { get; set; } public string StorageAccountKey { get; set; } public string DownloadedBlobContent { get; set; } public bool LocalFileExists { get; set; } @@ -1884,7 +1884,7 @@ private class AzureProxyReturnValues ActiveNodeCountByVmSize = new List(), ActiveJobCount = 0, ActivePoolCount = 0, - //BatchJobAndTaskState = BatchJobAndTaskStates.JobNotFound, + BatchTaskState = default, StorageAccountKey = "Key1", DownloadedBlobContent = string.Empty, LocalFileExists = true diff --git a/src/TesApi.Web/AzureBatchTaskState.cs b/src/TesApi.Web/AzureBatchTaskState.cs index 91dc69a4f..162059598 100644 --- a/src/TesApi.Web/AzureBatchTaskState.cs +++ b/src/TesApi.Web/AzureBatchTaskState.cs @@ -24,6 +24,11 @@ public record AzureBatchTaskState(TaskState State, IEnumerable Ou /// public enum TaskState { + /// + /// A request has been made for the task's cancellation. + /// + CancellationRequested, + /// /// The task has been assigned to a compute node, but is waiting for a /// required Job Preparation task to complete on the node. 
diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index 865add3c5..d0a0d5ba0 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -257,6 +257,7 @@ private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken canc { case ComputeNodeState.Unusable: _logger.LogDebug("Found unusable node {NodeId}", node.Id); + // TODO: notify running tasks that task will switch nodes? break; case ComputeNodeState.StartTaskFailed: @@ -266,6 +267,7 @@ private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken canc case ComputeNodeState.Preempted: _logger.LogDebug("Found preempted node {NodeId}", node.Id); + // TODO: notify running tasks that task will switch nodes? Or, in the future, terminate the task? break; default: // Should never reach here. Skip. @@ -421,14 +423,6 @@ public async ValueTask CanBeDeleted(CancellationToken cancellationToken = return true; } - /// - public ResizeError PopNextResizeError() - => ResizeErrors.TryDequeue(out var resizeError) ? resizeError : default; - - /// - public TaskFailureInformation PopNextStartTaskFailure() - => StartTaskFailures.TryDequeue(out var failure) ? failure : default; - /// /// Service methods dispatcher. 
/// @@ -458,7 +452,7 @@ internal async ValueTask ServicePoolAsync(ServiceKind serviceKind, CancellationT } /// - public async ValueTask ServicePoolAsync(CancellationToken cancellationToken = default) + public async IAsyncEnumerable<(string taskId, AzureBatchTaskState)> ServicePoolAsync([EnumeratorCancellation] CancellationToken cancellationToken) { var exceptions = new List(); @@ -470,7 +464,12 @@ await PerformTask(ServicePoolAsync(ServiceKind.Rotate, cancellationToken), cance switch (exceptions.Count) { case 0: - return; + await foreach (var (id, state) in _azureProxy.ListTasksAsync(Pool.PoolId, new ODATADetailLevel { FilterClause = "id", SelectClause = "state eq active" }).Select(cloud => cloud.Id).Zip(GetFailures(cancellationToken), (id, state) => (id, state)).WithCancellation(cancellationToken)) + { + yield return (id, state); + } + + yield break; case 1: throw exceptions.First(); @@ -504,6 +503,38 @@ async ValueTask PerformTask(ValueTask serviceAction, CancellationToken can return false; } + +#pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously + async IAsyncEnumerable GetFailures([EnumeratorCancellation] CancellationToken cancellationToken) +#pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously + { + for (var failure = PopNextStartTaskFailure(); failure is not null; failure = PopNextStartTaskFailure()) + { + yield return ConvertFromStartTask(failure); + } + + for (var failure = PopNextResizeError(); failure is not null; failure = PopNextResizeError()) + { + yield return ConvertFromResize(failure); + } + } + + AzureBatchTaskState ConvertFromResize(ResizeError failure) + => new(AzureBatchTaskState.TaskState.NodeAllocationFailed, Failure: new(failure.Code, Enumerable.Empty() + .Append(failure.Message) + .Concat(failure.Values.Select(t => t.Value)))); + + AzureBatchTaskState ConvertFromStartTask(TaskFailureInformation failure) + => 
new(AzureBatchTaskState.TaskState.NodeFailedDuringStartupOrExecution, Failure: new(failure.Code, Enumerable.Empty() + .Append(failure.Message) + .Append($"Start task failed ({failure.Category})") + .Concat(failure.Details.Select(t => t.Value)))); + + ResizeError PopNextResizeError() + => ResizeErrors.TryDequeue(out var resizeError) ? resizeError : default; + + TaskFailureInformation PopNextStartTaskFailure() + => StartTaskFailures.TryDequeue(out var failure) ? failure : default; } /// @@ -512,6 +543,7 @@ public IAsyncEnumerable GetCompletedTasks(CancellationToken _1) return _azureProxy.ListTasksAsync(Pool.PoolId, new ODATADetailLevel { FilterClause = "id,executionInfo", SelectClause = "state eq completed" }); } + // Returns false when pool/job was removed because it was not found. Returns true if the error was completely something else. private async ValueTask RemoveMissingPoolsAsync(Exception ex, CancellationToken cancellationToken) { switch (ex) @@ -525,12 +557,12 @@ private async ValueTask RemoveMissingPoolsAsync(Exception ex, Cancellation return result; case BatchException batchException: - if (batchException.RequestInformation.BatchError.Code == BatchErrorCodeStrings.PoolNotFound) + if (batchException.RequestInformation.BatchError.Code == BatchErrorCodeStrings.PoolNotFound || + batchException.RequestInformation.BatchError.Code == BatchErrorCodeStrings.JobNotFound) { - _logger.LogError(ex, "Batch pool {PoolId} is missing. Removing it from TES's active pool list.", Pool.PoolId); + _logger.LogError(ex, "Batch pool and/or job {PoolId} is missing. 
Removing them from TES's active pool list.", Pool.PoolId); _ = _batchPools.RemovePoolFromList(this); - // TODO: Consider moving any remaining tasks to another pool, or failing tasks explicitly - await _batchPools.DeletePoolAsync(this, cancellationToken); // Ensure job removal too + await _batchPools.DeletePoolAsync(this, cancellationToken); return false; } break; diff --git a/src/TesApi.Web/BatchPoolService.cs b/src/TesApi.Web/BatchPoolService.cs index 7174fd7ee..a1522b110 100644 --- a/src/TesApi.Web/BatchPoolService.cs +++ b/src/TesApi.Web/BatchPoolService.cs @@ -2,6 +2,7 @@ // Licensed under the MIT License. using System; +using System.Collections.Concurrent; using System.Collections.Generic; using System.Linq; using System.Runtime.CompilerServices; @@ -24,7 +25,7 @@ internal class BatchPoolService : OrchestrateOnBatchSchedulerService /// /// Interval between each call to . /// - public static readonly TimeSpan RunInterval = TimeSpan.FromSeconds(30); + public static readonly TimeSpan RunInterval = TimeSpan.FromSeconds(30); // The very fastest process inside of Azure Batch accessing anything within pools or jobs uses a 30 second polling interval /// /// Default constructor @@ -98,9 +99,31 @@ private Task ServiceBatchPoolsAsync(CancellationToken stoppingToken) /// /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. 
/// - private ValueTask ExecuteServiceBatchPoolsAsync(CancellationToken stoppingToken) + private async ValueTask ExecuteServiceBatchPoolsAsync(CancellationToken stoppingToken) { - return ExecuteActionOnPoolsAsync("ServiceBatchPools", (pool, token) => pool.ServicePoolAsync(token), stoppingToken); + var list = new ConcurrentBag<(TesTask TesTask, AzureBatchTaskState State)>(); + + await ExecuteActionOnPoolsAsync("ServiceBatchPools", (pool, token) => ProcessFailures(pool.ServicePoolAsync(token), token), stoppingToken); + + await OrchestrateTesTasksOnBatchAsync( + "Failures", +#pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously + async token => list.Select(t => t.TesTask).ToAsyncEnumerable(), +#pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously + (tesTasks, token) => batchScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, list.Select(t => t.State).ToArray(), token), + stoppingToken); + + async ValueTask ProcessFailures(IAsyncEnumerable<(string taskId, AzureBatchTaskState)> failures, CancellationToken cancellationToken) + { + await foreach (var (id, state) in failures.WithCancellation(cancellationToken)) + { + TesTask tesTask = default; + if (await repository.TryGetItemAsync(id, cancellationToken, task => tesTask = task) && tesTask is not null) + { + list.Add((tesTask, state)); + } + } + } } /// @@ -128,7 +151,7 @@ await OrchestrateTesTasksOnBatchAsync( #pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously async token => GetTesTasks(token), #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously - (tesTasks, token) => batchScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, tasks.Select(GetBatchState).ToArray(), token), + (tesTasks, token) => batchScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, tasks.Select(GetCompletedBatchState).ToArray(), token), stoppingToken); async IAsyncEnumerable 
GetTesTasks([EnumeratorCancellation] CancellationToken cancellationToken) @@ -143,11 +166,18 @@ async IAsyncEnumerable GetTesTasks([EnumeratorCancellation] Cancellatio } } - AzureBatchTaskState GetBatchState(CloudTask task) + AzureBatchTaskState GetCompletedBatchState(CloudTask task) { - if (task.ExecutionInformation.ExitCode != 0 || task.ExecutionInformation.FailureInformation is not null) + return task.ExecutionInformation.Result switch { - return new(AzureBatchTaskState.TaskState.CompletedWithErrors, + Microsoft.Azure.Batch.Common.TaskExecutionResult.Success => new( + AzureBatchTaskState.TaskState.CompletedSuccessfully, + BatchTaskStartTime: task.ExecutionInformation.StartTime, + BatchTaskEndTime: task.ExecutionInformation.EndTime, + BatchTaskExitCode: task.ExecutionInformation.ExitCode), + + Microsoft.Azure.Batch.Common.TaskExecutionResult.Failure => new( + AzureBatchTaskState.TaskState.CompletedWithErrors, Failure: new(task.ExecutionInformation.FailureInformation.Code, Enumerable.Empty() .Append(task.ExecutionInformation.FailureInformation.Message) @@ -155,15 +185,10 @@ AzureBatchTaskState GetBatchState(CloudTask task) .Concat(task.ExecutionInformation.FailureInformation.Details.Select(pair => pair.Value))), BatchTaskStartTime: task.ExecutionInformation.StartTime, BatchTaskEndTime: task.ExecutionInformation.EndTime, - BatchTaskExitCode: task.ExecutionInformation.ExitCode); - } - else - { - return new(AzureBatchTaskState.TaskState.CompletedSuccessfully, - BatchTaskStartTime: task.ExecutionInformation.StartTime, - BatchTaskEndTime: task.ExecutionInformation.EndTime, - BatchTaskExitCode: task.ExecutionInformation.ExitCode); - } + BatchTaskExitCode: task.ExecutionInformation.ExitCode), + + _ => throw new InvalidOperationException(), + }; } } } diff --git a/src/TesApi.Web/BatchScheduler.BatchPools.cs b/src/TesApi.Web/BatchScheduler.BatchPools.cs index 1bd1292bd..82ea9afe0 100644 --- a/src/TesApi.Web/BatchScheduler.BatchPools.cs +++ 
b/src/TesApi.Web/BatchScheduler.BatchPools.cs @@ -203,6 +203,7 @@ public async ValueTask FlushPoolsAsync(IEnumerable assignedPools, Cancel /// public Task DeletePoolAsync(IBatchPool pool, CancellationToken cancellationToken) { + // TODO: Consider moving any remaining tasks to another pool, or failing tasks explicitly logger.LogDebug(@"Deleting pool and job {PoolId}", pool.Pool.PoolId); return Task.WhenAll( diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 1d6a0f8f0..600e14ef3 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -186,7 +186,7 @@ public BatchScheduler( static bool tesTaskCancellationRequested(TesTask tesTask) => tesTask.State == TesState.CANCELINGEnum; static bool tesTaskDeletionReady(TesTask tesTask) => tesTask.IsTaskDeletionRequired; - async Task SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, AzureBatchTaskState batchInfo, CancellationToken cancellationToken) + async Task SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { var metrics = newTaskState == TesState.COMPLETEEnum ? 
await GetBatchNodeMetricsAndCromwellResultCodeAsync(tesTask, cancellationToken) @@ -216,52 +216,54 @@ async Task SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, AzureBatch { tesTask.AddToSystemLog(batchInfo.Failure.SystemLogs); } - //else if (!string.IsNullOrWhiteSpace(batchInfo.AlternateSystemLogItem)) - //{ - // tesTask.AddToSystemLog(new[] { batchInfo.AlternateSystemLogItem }); - //} + else if (!string.IsNullOrWhiteSpace(batchInfo.AlternateSystemLogItem)) + { + tesTask.AddToSystemLog(new[] { batchInfo.AlternateSystemLogItem }); + } } - } - async Task SetTaskCompleted(TesTask tesTask, AzureBatchTaskState batchInfo, CancellationToken cancellationToken) - { - await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); - await SetTaskStateAndLog(tesTask, TesState.COMPLETEEnum, batchInfo, cancellationToken); + return true; } - async Task SetTaskExecutorError(TesTask tesTask, AzureBatchTaskState batchInfo, CancellationToken cancellationToken) + async Task SetTaskCompleted(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); - await SetTaskStateAndLog(tesTask, TesState.EXECUTORERROREnum, batchInfo, cancellationToken); + return await SetTaskStateAndLog(tesTask, TesState.COMPLETEEnum, batchInfo, cancellationToken); } - async Task SetTaskSystemError(TesTask tesTask, AzureBatchTaskState batchInfo, CancellationToken cancellationToken) + async Task SetTaskExecutorError(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); - await SetTaskStateAndLog(tesTask, TesState.SYSTEMERROREnum, batchInfo, cancellationToken); + return await SetTaskStateAndLog(tesTask, TesState.EXECUTORERROREnum, batchInfo, cancellationToken); } - async Task DeleteBatchJobAndSetTaskStateAsync(TesTask tesTask, TesState 
newTaskState, AzureBatchTaskState batchInfo, CancellationToken cancellationToken) + //async Task SetTaskSystemError(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) + //{ + // await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); + // return await SetTaskStateAndLog(tesTask, TesState.SYSTEMERROREnum, batchInfo, cancellationToken); + //} + + async Task DeleteBatchTaskAndSetTaskStateAsync(TesTask tesTask, TesState newTaskState, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); - await SetTaskStateAndLog(tesTask, newTaskState, batchInfo, cancellationToken); + return await SetTaskStateAndLog(tesTask, newTaskState, batchInfo, cancellationToken); } - Task DeleteBatchJobAndSetTaskExecutorErrorAsync(TesTask tesTask, AzureBatchTaskState batchInfo, CancellationToken cancellationToken) => DeleteBatchJobAndSetTaskStateAsync(tesTask, TesState.EXECUTORERROREnum, batchInfo, cancellationToken); - Task DeleteBatchJobAndSetTaskSystemErrorAsync(TesTask tesTask, AzureBatchTaskState batchInfo, CancellationToken cancellationToken) => DeleteBatchJobAndSetTaskStateAsync(tesTask, TesState.SYSTEMERROREnum, batchInfo, cancellationToken); + Task DeleteBatchTaskAndSetTaskExecutorErrorAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) => DeleteBatchTaskAndSetTaskStateAsync(tesTask, TesState.EXECUTORERROREnum, batchInfo, cancellationToken); + Task DeleteBatchTaskAndSetTaskSystemErrorAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) => DeleteBatchTaskAndSetTaskStateAsync(tesTask, TesState.SYSTEMERROREnum, batchInfo, cancellationToken); - Task DeleteBatchJobAndRequeueTaskAsync(TesTask tesTask, AzureBatchTaskState batchInfo, CancellationToken cancellationToken) + Task DeleteBatchTaskAndRequeueTaskAsync(TesTask tesTask, CombinedBatchTaskInfo 
batchInfo, CancellationToken cancellationToken) => ++tesTask.ErrorCount > 3 - ? AddSystemLogAndDeleteBatchJobAndSetTaskExecutorErrorAsync(tesTask, batchInfo, "System Error: Retry count exceeded.", cancellationToken) - : DeleteBatchJobAndSetTaskStateAsync(tesTask, TesState.QUEUEDEnum, batchInfo, cancellationToken); + ? AddSystemLogAndDeleteBatchTaskAndSetTaskExecutorErrorAsync(tesTask, batchInfo, "System Error: Retry count exceeded.", cancellationToken) + : DeleteBatchTaskAndSetTaskStateAsync(tesTask, TesState.QUEUEDEnum, batchInfo, cancellationToken); - Task AddSystemLogAndDeleteBatchJobAndSetTaskExecutorErrorAsync(TesTask tesTask, AzureBatchTaskState batchInfo, string alternateSystemLogItem, CancellationToken cancellationToken) + Task AddSystemLogAndDeleteBatchTaskAndSetTaskExecutorErrorAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, string alternateSystemLogItem, CancellationToken cancellationToken) { - batchInfo = new(batchInfo.State, batchInfo.OutputFileLogs, new(batchInfo.Failure.Reason, (batchInfo.Failure.SystemLogs ?? Enumerable.Empty()).Append(alternateSystemLogItem)), batchInfo.CloudTaskCreationTime, batchInfo.BatchTaskStartTime, batchInfo.BatchTaskEndTime, batchInfo.BatchTaskExitCode); - return DeleteBatchJobAndSetTaskExecutorErrorAsync(tesTask, batchInfo, cancellationToken); + batchInfo = new(new(batchInfo.State, batchInfo.OutputFileLogs, new(batchInfo.Failure.Reason, (batchInfo.Failure.SystemLogs ?? 
Enumerable.Empty()).Append(alternateSystemLogItem)), batchInfo.CloudTaskCreationTime, batchInfo.BatchTaskStartTime, batchInfo.BatchTaskEndTime, batchInfo.BatchTaskExitCode), batchInfo.AlternateSystemLogItem); + return DeleteBatchTaskAndSetTaskExecutorErrorAsync(tesTask, batchInfo, cancellationToken); } - Task HandlePreemptedNodeAsync(TesTask tesTask, AzureBatchTaskState batchInfo, CancellationToken cancellationToken) + Task HandlePreemptedNodeAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { logger.LogInformation("The TesTask {TesTask}'s node was preempted. It will be automatically rescheduled.", tesTask.Id); tesTask.State = TesState.INITIALIZINGEnum; @@ -270,18 +272,19 @@ Task HandlePreemptedNodeAsync(TesTask tesTask, AzureBatchTaskState batchInfo, Ca tesTaskStateTransitions = new List() { - //new TesTaskStateTransition(tesTaskCancellationRequested, batchTaskState: null, alternateSystemLogItem: null, CancelTaskAsync), + new TesTaskStateTransition(tesTaskDeletionReady, batchTaskState: null, alternateSystemLogItem: null, (tesTask, _, ct) => DeleteCancelledTaskAsync(tesTask, ct)), + new TesTaskStateTransition(tesTaskCancellationRequested, batchTaskState: null, alternateSystemLogItem: null, (tesTask, _, ct) => TerminateBatchTaskAsync(tesTask, ct)), //new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.JobNotFound, alternateSystemLogItem: null, (tesTask, _, ct) => AddBatchTaskAsync(tesTask, ct)), //new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.MissingBatchTask, alternateSystemLogItem: null, (tesTask, _, ct) => AddBatchTaskAsync(tesTask, ct)), - //new TesTaskStateTransition(tesTaskIsQueued, AzureBatchTaskState.TaskState.Initializing, alternateSystemLogItem: null, (tesTask, _) => tesTask.State = TesState.INITIALIZINGEnum), - new TesTaskStateTransition(tesTaskIsQueuedOrInitializing, AzureBatchTaskState.TaskState.NodeAllocationFailed, alternateSystemLogItem: null, DeleteBatchJobAndRequeueTaskAsync), - new 
TesTaskStateTransition(tesTaskIsQueuedOrInitializing, AzureBatchTaskState.TaskState.Running, alternateSystemLogItem: null, (tesTask, _) => tesTask.State = TesState.RUNNINGEnum), + new TesTaskStateTransition(tesTaskIsQueued, AzureBatchTaskState.TaskState.Initializing, alternateSystemLogItem: null, (tesTask, _) => { tesTask.State = TesState.INITIALIZINGEnum; return true; }), + new TesTaskStateTransition(tesTaskIsQueuedOrInitializing, AzureBatchTaskState.TaskState.NodeAllocationFailed, alternateSystemLogItem: null, DeleteBatchTaskAndRequeueTaskAsync), + new TesTaskStateTransition(tesTaskIsQueuedOrInitializing, AzureBatchTaskState.TaskState.Running, alternateSystemLogItem: null, (tesTask, _) => { tesTask.State = TesState.RUNNINGEnum; return true; }), //new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.MoreThanOneActiveJobOrTaskFound, BatchTaskState.MoreThanOneActiveJobOrTaskFound.ToString(), DeleteBatchJobAndSetTaskSystemErrorAsync), new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.CompletedSuccessfully, alternateSystemLogItem: null, SetTaskCompleted), new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.CompletedWithErrors, "Please open an issue. There should have been an error reported here.", SetTaskExecutorError), //new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.ActiveJobWithMissingAutoPool, alternateSystemLogItem: null, DeleteBatchJobAndRequeueTaskAsync), - new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.NodeFailedDuringStartupOrExecution, "Please open an issue. There should have been an error reported here.", DeleteBatchJobAndSetTaskExecutorErrorAsync), - new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.NodeUnusable, "Please open an issue. 
There should have been an error reported here.", DeleteBatchJobAndSetTaskExecutorErrorAsync), + new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.NodeFailedDuringStartupOrExecution, "Please open an issue. There should have been an error reported here.", DeleteBatchTaskAndSetTaskSystemErrorAsync), + new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.NodeUnusable, "Please open an issue. There should have been an error reported here.", DeleteBatchTaskAndSetTaskExecutorErrorAsync), //new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.JobNotFound, BatchTaskState.JobNotFound.ToString(), SetTaskSystemError), //new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.MissingBatchTask, BatchTaskState.MissingBatchTask.ToString(), DeleteBatchJobAndSetTaskSystemErrorAsync), new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.NodePreempted, alternateSystemLogItem: null, HandlePreemptedNodeAsync) @@ -372,113 +375,6 @@ public async Task UploadTaskRunnerIfNeeded(CancellationToken cancellationToken) } } - ///// - //public async IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessCompletedTesTasksAsync(IEnumerable tesTasks, CloudTask[] cloudTasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) - //{ - // ArgumentNullException.ThrowIfNull(tesTasks); - // ArgumentNullException.ThrowIfNull(cloudTasks); - - // foreach (var (cloudTask, tesTask) in cloudTasks.Zip(tesTasks)) - // { - // // Skip if this task was previously updated. 
- // if (tesTask.State != TesState.COMPLETEEnum && tesTask.State != TesState.SYSTEMERROREnum && tesTask.State != TesState.EXECUTORERROREnum) - // { - // try - // { - // await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); - // } - // catch (Exception ex) - // { - // logger.LogError(ex, @"Failure when deleting cloud task for TES task: {TesTask} Exception: {ExceptionType}: {ExceptionMessage}", tesTask.Id, ex.GetType().FullName, ex.Message); - // tesTask.IsTaskDeletionRequired = true; - // } - - // if (cloudTask.ExecutionInformation.ExitCode != 0 || cloudTask.ExecutionInformation.FailureInformation is not null) - // { - // await SetTaskStateAndLog(tesTask, TesState.EXECUTORERROREnum, cloudTask); - // } - // else - // { - // await SetTaskStateAndLog(tesTask, TesState.COMPLETEEnum, cloudTask); - // } - - // yield return (tesTask, Task.FromResult(true)); - // } - // } - - // async ValueTask SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, CloudTask cloudTask) - // { - // var metrics = newTaskState == TesState.COMPLETEEnum - // ? await GetBatchNodeMetricsAndCromwellResultCodeAsync(tesTask, cancellationToken) - // : default; - - // tesTask.State = newTaskState; - - // var tesTaskLog = tesTask.GetOrAddTesTaskLog(); - // var tesTaskExecutorLog = tesTaskLog.GetOrAddExecutorLog(); - - // tesTaskLog.BatchNodeMetrics = metrics.BatchNodeMetrics; - // tesTaskLog.CromwellResultCode = metrics.CromwellRcCode; - // tesTaskLog.EndTime = DateTime.UtcNow; - // tesTaskExecutorLog.StartTime = metrics.TaskStartTime ?? cloudTask.ExecutionInformation?.StartTime; - // tesTaskExecutorLog.EndTime = metrics.TaskEndTime ?? cloudTask.ExecutionInformation?.EndTime; - // tesTaskExecutorLog.ExitCode = cloudTask.ExecutionInformation?.ExitCode; - - // // Only accurate when the task completes successfully, otherwise it's the Batch time as reported from Batch - // // TODO this could get large; why? 
- // //var timefromCoAScriptCompletionToBatchTaskDetectedComplete = tesTaskLog.EndTime - tesTaskExecutorLog.EndTime; - - // if (newTaskState != TesState.COMPLETEEnum) - // { - // tesTask.SetFailureReason(cloudTask.ExecutionInformation?.FailureInformation?.Code); - // tesTask.AddToSystemLog(Enumerable.Empty() - // .Append($"Batch task ExitCode: {cloudTask.ExecutionInformation?.ExitCode}, Failure message: {cloudTask.ExecutionInformation?.FailureInformation?.Message}") - // .Concat(cloudTask.ExecutionInformation?.FailureInformation?.Details?.Select(d => $"{d.Name}: {d.Value}") ?? Enumerable.Empty())); - // } - // } - //} - - /// - public async IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessCancelledTesTasksAsync(IEnumerable tesTasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) - { - foreach (var task in tesTasks) - { - Task result; - - try - { - result = Task.FromResult(await TerminateBatchTaskAsync(task, cancellationToken)); - } - catch (Exception ex) - { - result = Task.FromException(ex); - } - - yield return (task, result); - } - } - - /// - public async IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessTerminatedTesTasksAsync(IEnumerable tesTasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) - { - foreach (var task in tesTasks) - { - Task result; - - try - { - result = Task.FromResult(await DeleteCancelledTaskAsync(task, cancellationToken)); - } - catch (Exception ex) - { - result = Task.FromException(ex); - } - - yield return (task, result); - } - } - - private static string GetCromwellExecutionDirectoryPathAsUrl(TesTask task) { var commandScript = task.Inputs?.FirstOrDefault(IsCromwellCommandScript); @@ -564,12 +460,12 @@ private bool TryGetCromwellTmpFilePath(string fileUri, out string localPath) /// public async IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessQueuedTesTasksAsync(TesTask[] tesTasks, 
[System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) { - var taskMetadataListByPoolKey = new Dictionary>(); + var tasksMetadataByPoolKey = new Dictionary>(); var poolKeyByTaskIds = new Dictionary(); var tasks = tesTasks.ToList(); // Determine how many nodes in each pool we might need for this group. - foreach (var tesTask in tesTasks) + foreach (var tesTask in tesTasks) // TODO: Consider parallelizing this foreach loop { Task quickResult = default; string poolKey = default; @@ -581,13 +477,13 @@ private bool TryGetCromwellTmpFilePath(string fileUri, out string localPath) (poolKey, var displayName) = GetPoolKey(tesTask, virtualMachineInfo, containerMetadata.ContainerConfiguration, cancellationToken); await quotaVerifier.CheckBatchAccountQuotasAsync(virtualMachineInfo, needPoolOrJobQuotaCheck: !IsPoolAvailable(poolKey), cancellationToken: cancellationToken); - if (taskMetadataListByPoolKey.TryGetValue(poolKey, out var resource)) + if (tasksMetadataByPoolKey.TryGetValue(poolKey, out var resource)) { resource.Add((tesTask, virtualMachineInfo, containerMetadata, displayName)); } else { - taskMetadataListByPoolKey.Add(poolKey, new() { (tesTask, virtualMachineInfo, containerMetadata, displayName) }); + tasksMetadataByPoolKey.Add(poolKey, new() { (tesTask, virtualMachineInfo, containerMetadata, displayName) }); } poolKeyByTaskIds.Add(tesTask.Id, poolKey); @@ -605,39 +501,40 @@ private bool TryGetCromwellTmpFilePath(string fileUri, out string localPath) } // Remove already returned tasks from the dictionary - taskMetadataListByPoolKey = taskMetadataListByPoolKey + tasksMetadataByPoolKey = tasksMetadataByPoolKey .Select(p => (p.Key, Value: p.Value.Where(v => tasks.Contains(v.TesTask)).ToList())) // keep only tasks that remain in the tasks variable .Where(t => t.Value.Count != 0) // Remove any empty pool keys .ToDictionary(p => p.Key, p => p.Value); // Determine how many nodes in each new pool we might need for this group. 
- var neededPoolNodesByPoolKey = taskMetadataListByPoolKey.ToDictionary(t => t.Key, t => t.Value.Count); + var neededPoolNodesByPoolKey = tasksMetadataByPoolKey.ToDictionary(t => t.Key, t => t.Value.Count); // Determine how many new pools/jobs we will need for this batch - var requiredNewPools = neededPoolNodesByPoolKey.Where(t => !IsPoolAvailable(t.Key)).Sum(t => t.Value); + var requiredNewPools = neededPoolNodesByPoolKey.Where(t => !IsPoolAvailable(t.Key)).Count(); // Revisit pool/job quotas (the above loop already dealt with the possiblility of needing just one more pool/job) if (requiredNewPools > 1) { var (excess, exception) = await quotaVerifier.CheckBatchAccountPoolOrJobQuotasAsync(requiredNewPools, cancellationToken); - var initial = taskMetadataListByPoolKey.Count - 1; + var initial = tasksMetadataByPoolKey.Count - 1; var final = initial - excess; for (var i = initial; i > final; --i) { - var key = taskMetadataListByPoolKey.Keys.ElementAt(i); - if (taskMetadataListByPoolKey.Remove(key, out var listOfTaskMetadata)) + var key = tasksMetadataByPoolKey.Keys.ElementAt(i); + if (tasksMetadataByPoolKey.Remove(key, out var listOfTaskMetadata)) { foreach (var (task, _, _, _) in listOfTaskMetadata) { - yield return (task, Task.FromException(exception)); + tasks.Remove(task); + yield return (task, HandleException(exception, key, task)); } } } } // Obtain assigned pool and create and assign the cloudtask for each task. 
- foreach (var (tesTask, virtualMachineInfo, containerMetadata, displayName) in taskMetadataListByPoolKey.Values.SelectMany(e => e)) + foreach (var (tesTask, virtualMachineInfo, containerMetadata, displayName) in tasksMetadataByPoolKey.Values.SelectMany(e => e)) { Task quickResult = default; var poolKey = poolKeyByTaskIds[tesTask.Id]; @@ -1049,9 +946,9 @@ Task HandleException(Exception exception, string poolKey, TesTask tesTask) ArgumentNullException.ThrowIfNull(taskStates); return taskStates.Zip(tesTasks, (TaskState, TesTask) => (TaskState, TesTask)) - .Where(entry => entry.TesTask.IsActiveState()) - .Select(entry => (entry.TesTask, HandleTesTaskTransitionAsync(entry.TesTask, entry.TaskState, cancellationToken).AsTask())) - .WhenEach(cancellationToken, tuple => tuple.Item2); + .Where(entry => entry.TesTask.IsActiveState()) // Removes already terminal TesTasks + .Select(entry => (entry.TesTask, IsModifiedAsync: HandleTesTaskTransitionAsync(entry.TesTask, entry.TaskState, cancellationToken).AsTask())) + .WhenEach(cancellationToken, tuple => tuple.IsModifiedAsync); } /// @@ -1072,9 +969,9 @@ Task HandleException(Exception exception, string poolKey, TesTask tesTask) // tesTask.Log[].Log[].StartTime // tesTask.Log[].Log[].EndTime private ValueTask HandleTesTaskTransitionAsync(TesTask tesTask, AzureBatchTaskState azureBatchTaskState, CancellationToken cancellationToken) - => (tesTaskStateTransitions + => tesTaskStateTransitions .FirstOrDefault(m => (m.Condition is null || m.Condition(tesTask)) && (m.CurrentBatchTaskState is null || m.CurrentBatchTaskState == azureBatchTaskState.State)) - ?.ActionAsync(tesTask, azureBatchTaskState, cancellationToken) ?? ValueTask.FromResult(false)); + ?.ActionAsync(tesTask, azureBatchTaskState, cancellationToken) ?? 
ValueTask.FromResult(false); /// /// Returns job preparation and main Batch tasks that represents the given @@ -2052,16 +1949,16 @@ private static Dictionary DelimitedTextToDictionary(string text, /// private class TesTaskStateTransition { - public TesTaskStateTransition(Func condition, AzureBatchTaskState.TaskState? batchTaskState, string alternateSystemLogItem, Func asyncAction) + public TesTaskStateTransition(Predicate condition, AzureBatchTaskState.TaskState? batchTaskState, string alternateSystemLogItem, Func> asyncAction) : this(condition, batchTaskState, alternateSystemLogItem, asyncAction, null) { } - public TesTaskStateTransition(Func condition, AzureBatchTaskState.TaskState? batchTaskState, string alternateSystemLogItem, Action action) + public TesTaskStateTransition(Predicate condition, AzureBatchTaskState.TaskState? batchTaskState, string alternateSystemLogItem, Func action) : this(condition, batchTaskState, alternateSystemLogItem, null, action) { } - private TesTaskStateTransition(Func condition, AzureBatchTaskState.TaskState? batchTaskState, string alternateSystemLogItem, Func asyncAction, Action action) + private TesTaskStateTransition(Predicate condition, AzureBatchTaskState.TaskState? batchTaskState, string alternateSystemLogItem, Func> asyncAction, Func action) { Condition = condition; CurrentBatchTaskState = batchTaskState; @@ -2070,28 +1967,27 @@ private TesTaskStateTransition(Func condition, AzureBatchTaskStat Action = action; } - public Func Condition { get; } + public Predicate Condition { get; } public AzureBatchTaskState.TaskState? CurrentBatchTaskState { get; } private string AlternateSystemLogItem { get; } - private Func AsyncAction { get; } - private Action Action { get; } + private Func> AsyncAction { get; } + private Func Action { get; } /// /// Calls and/or . /// /// - /// + /// /// A for controlling the lifetime of the asynchronous operation. /// True an action was called, otherwise False. 
- public async ValueTask ActionAsync(TesTask tesTask, AzureBatchTaskState combinedBatchTaskInfo, CancellationToken cancellationToken) + public async ValueTask ActionAsync(TesTask tesTask, AzureBatchTaskState batchState, CancellationToken cancellationToken) { - //combinedBatchTaskInfo.AlternateSystemLogItem = AlternateSystemLogItem; + CombinedBatchTaskInfo combinedBatchTaskInfo = new(batchState, AlternateSystemLogItem); var tesTaskChanged = false; if (AsyncAction is not null) { - await AsyncAction(tesTask, combinedBatchTaskInfo, cancellationToken); - tesTaskChanged = true; + tesTaskChanged = await AsyncAction(tesTask, combinedBatchTaskInfo, cancellationToken); } if (Action is not null) @@ -2104,19 +2000,16 @@ public async ValueTask ActionAsync(TesTask tesTask, AzureBatchTaskState co } } - //private class CombinedBatchTaskInfo - //{ - // public BatchTaskState BatchTaskState { get; set; } - // public BatchNodeMetrics BatchNodeMetrics { get; set; } - // public string FailureReason { get; set; } - // public DateTimeOffset? BatchTaskStartTime { get; set; } - // public DateTimeOffset? BatchTaskEndTime { get; set; } - // public int? BatchTaskExitCode { get; set; } - // public int? 
CromwellRcCode { get; set; } - // public IEnumerable SystemLogItems { get; set; } - // public PoolInformation Pool { get; set; } - // public string AlternateSystemLogItem { get; set; } - //} + private record CombinedBatchTaskInfo : AzureBatchTaskState + { + public CombinedBatchTaskInfo(AzureBatchTaskState state, string alternateSystemLogItem) + : base(state.State, state.OutputFileLogs, state.Failure, state.CloudTaskCreationTime, state.BatchTaskStartTime, state.BatchTaskEndTime, state.BatchTaskExitCode) + { + AlternateSystemLogItem = alternateSystemLogItem; + } + + public string AlternateSystemLogItem { get; } + } private class TesInputPathEqualityComparer : IEqualityComparer { diff --git a/src/TesApi.Web/Extensions/SystemThreadingTaskExtensions.cs b/src/TesApi.Web/Extensions/SystemThreadingTaskExtensions.cs index 0722bc298..b2f4c2d0f 100644 --- a/src/TesApi.Web/Extensions/SystemThreadingTaskExtensions.cs +++ b/src/TesApi.Web/Extensions/SystemThreadingTaskExtensions.cs @@ -2,7 +2,9 @@ // Licensed under the MIT License. 
using System; +using System.Collections; using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; using System.Linq; using System.Runtime.CompilerServices; using System.Threading; @@ -41,7 +43,7 @@ public static async IAsyncEnumerable WhenEach(this IEnumerable source, var list = source.Where(e => e is not null).Select(e => (Entry: e, Task: sourceToTask(e))).ToList(); var pendingCount = list.Count; - if (list.Select(e => e.Task).ToHashSet().Count != pendingCount) // Check for duplicate tasks + if (list.Select(e => e.Task).ToHashSet(new SystemTaskEqualityComparer()).Count != pendingCount) // Check for duplicate tasks { throw new ArgumentException("Duplicate System.Threading.Tasks found referenced in collection.", nameof(source)); } @@ -78,5 +80,14 @@ public static async IAsyncEnumerable WhenEach(this IEnumerable source, yield return entry; } } + + private class SystemTaskEqualityComparer : IEqualityComparer + { + public bool Equals(Task x, Task y) + => ReferenceEquals(x, y); + + public int GetHashCode([DisallowNull] Task obj) + => obj?.GetHashCode() ?? 0; + } } } diff --git a/src/TesApi.Web/IBatchPool.cs b/src/TesApi.Web/IBatchPool.cs index 6fb4b6ab5..f65db2545 100644 --- a/src/TesApi.Web/IBatchPool.cs +++ b/src/TesApi.Web/IBatchPool.cs @@ -45,25 +45,13 @@ public interface IBatchPool /// ValueTask CanBeDeleted(CancellationToken cancellationToken = default); - /// - /// Removes and returns the next available resize error. - /// - /// The first in the list, or null if the list is empty. - /// appears to contain two entries with containing respectively "code" & "message" - ResizeError PopNextResizeError(); - - /// - /// Removes and returns the next available start task failure. - /// - /// The first in the list, or null if the list is empty. - TaskFailureInformation PopNextStartTaskFailure(); - /// /// Updates this instance based on changes to its environment. /// /// /// Calls each internal servicing method in order. 
Throws all exceptions gathered from all methods. - ValueTask ServicePoolAsync(CancellationToken cancellationToken = default); + /// + IAsyncEnumerable<(string taskId, AzureBatchTaskState)> ServicePoolAsync(CancellationToken cancellationToken = default); /// /// Gets the last time the pool's compute node list was changed. diff --git a/src/TesApi.Web/IBatchScheduler.cs b/src/TesApi.Web/IBatchScheduler.cs index 604bbea4f..026d1dd1a 100644 --- a/src/TesApi.Web/IBatchScheduler.cs +++ b/src/TesApi.Web/IBatchScheduler.cs @@ -43,31 +43,6 @@ public interface IBatchScheduler /// True for each corresponding that needs to be persisted. IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessQueuedTesTasksAsync(TesTask[] tesTasks, CancellationToken cancellationToken); - /// - /// Finalize completed s on a batch system - /// - /// s to schedule on the batch system. - /// s corresponding to each . - /// A for controlling the lifetime of the asynchronous operation. - /// True for each corresponding that needs to be persisted. - //IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessCompletedTesTasksAsync(IEnumerable tesTasks, CloudTask[] cloudTasks, CancellationToken cancellationToken); - - /// - /// Terminate cancelled s on a batch system - /// - /// s to schedule on the batch system. - /// A for controlling the lifetime of the asynchronous operation. - /// True for each corresponding that needs to be persisted. - IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessCancelledTesTasksAsync(IEnumerable tesTasks, CancellationToken cancellationToken); - - /// - /// Delete terminated s on a batch system - /// - /// s to schedule on the batch system. - /// A for controlling the lifetime of the asynchronous operation. - /// True for each corresponding that needs to be persisted. 
- IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessTerminatedTesTasksAsync(IEnumerable tesTasks, CancellationToken cancellationToken); - /// /// Assign pool and compute node errors to s on a batch system /// diff --git a/src/TesApi.Web/OrchestrateOnBatchSchedulerService.cs b/src/TesApi.Web/OrchestrateOnBatchSchedulerService.cs index 3e16c77e5..c6c53b8c7 100644 --- a/src/TesApi.Web/OrchestrateOnBatchSchedulerService.cs +++ b/src/TesApi.Web/OrchestrateOnBatchSchedulerService.cs @@ -168,7 +168,6 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func< if (exc is Microsoft.Azure.Batch.Common.BatchException batchException) { var requestInfo = batchException.RequestInformation; - //var requestId = batchException.RequestInformation?.ServiceRequestId; var reason = (batchException.InnerException as Microsoft.Azure.Batch.Protocol.Models.BatchErrorException)?.Response?.ReasonPhrase; var logs = new List(); @@ -181,6 +180,7 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func< { logs.Add($"BatchErrorCode: {requestInfo.BatchError.Code}"); logs.Add($"BatchErrorMessage: {requestInfo.BatchError.Message}"); + foreach (var detail in requestInfo.BatchError.Values?.Select(d => $"{d.Key}={d.Value}") ?? Enumerable.Empty()) { logs.Add(detail); @@ -262,7 +262,7 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func< logger.LogError(exc, "Updating TES Task '{TesTask}' threw {ExceptionType}: '{ExceptionMessage}'. 
Stack trace: {ExceptionStackTrace}", tesTask.Id, exc.GetType().FullName, exc.Message, exc.StackTrace); } - if (!string.IsNullOrWhiteSpace(tesTask.PoolId) && (TesState.QUEUEDEnum == tesTask.State || TesState.RUNNINGEnum == tesTask.State)) + if (!string.IsNullOrWhiteSpace(tesTask.PoolId) && (TesState.INITIALIZINGEnum == tesTask.State || TesState.RUNNINGEnum == tesTask.State)) { pools.Add(tesTask.PoolId); } diff --git a/src/TesApi.Web/Scheduler.cs b/src/TesApi.Web/Scheduler.cs index 3da48eb6f..a80abcd39 100644 --- a/src/TesApi.Web/Scheduler.cs +++ b/src/TesApi.Web/Scheduler.cs @@ -9,7 +9,6 @@ using Microsoft.Extensions.Logging; using Tes.Models; using Tes.Repository; -using YamlDotNet.Core.Tokens; namespace TesApi.Web { @@ -20,7 +19,8 @@ namespace TesApi.Web /// internal class Scheduler : OrchestrateOnBatchSchedulerService { - private readonly TimeSpan runInterval = TimeSpan.FromSeconds(30); // The very fastest process inside of Azure Batch accessing anything within pools or jobs uses a 30 second polling interval + private readonly TimeSpan blobRunInterval = TimeSpan.FromSeconds(5); + private readonly TimeSpan batchRunInterval = TimeSpan.FromSeconds(30); // The very fastest process inside of Azure Batch accessing anything within pools or jobs uses a 30 second polling interval /// /// Default constructor @@ -78,7 +78,7 @@ private Task ExecuteQueuedTesTasksOnBatchAsync(CancellationToken stoppingToken) .OrderBy(t => t.CreationTime) .ToAsyncEnumerable()); - return ExecuteActionOnIntervalAsync(runInterval, + return ExecuteActionOnIntervalAsync(batchRunInterval, cancellationToken => OrchestrateTesTasksOnBatchAsync("Queued", query, batchScheduler.ProcessQueuedTesTasksAsync, cancellationToken), stoppingToken); } @@ -97,8 +97,15 @@ private Task ExecuteCancelledTesTasksOnBatchAsync(CancellationToken stoppingToke .OrderBy(t => t.CreationTime) .ToAsyncEnumerable()); - return ExecuteActionOnIntervalAsync(runInterval, - cancellationToken => 
OrchestrateTesTasksOnBatchAsync("Cancelled", query, batchScheduler.ProcessCancelledTesTasksAsync, cancellationToken), + return ExecuteActionOnIntervalAsync(batchRunInterval, + cancellationToken => OrchestrateTesTasksOnBatchAsync( + "Cancelled", + query, + (tasks, cancellationToken) => batchScheduler.ProcessTesTaskBatchStatesAsync( + tasks, + Enumerable.Repeat(new(AzureBatchTaskState.TaskState.CancellationRequested), tasks.Length).ToArray(), + cancellationToken), + cancellationToken), stoppingToken); } @@ -116,8 +123,15 @@ private Task ExecuteTerminatedTesTasksOnBatchAsync(CancellationToken stoppingTok .OrderBy(t => t.CreationTime) .ToAsyncEnumerable()); - return ExecuteActionOnIntervalAsync(runInterval, - cancellationToken => OrchestrateTesTasksOnBatchAsync("Terminated", query, batchScheduler.ProcessTerminatedTesTasksAsync, cancellationToken), + return ExecuteActionOnIntervalAsync(batchRunInterval, + cancellationToken => OrchestrateTesTasksOnBatchAsync( + "Terminated", + query, + (tasks, cancellationToken) => batchScheduler.ProcessTesTaskBatchStatesAsync( + tasks, + Enumerable.Repeat(new(AzureBatchTaskState.TaskState.CancellationRequested), tasks.Length).ToArray(), + cancellationToken), + cancellationToken), stoppingToken); } @@ -128,7 +142,7 @@ private Task ExecuteTerminatedTesTasksOnBatchAsync(CancellationToken stoppingTok /// private Task ExecuteUpdateTesTaskFromEventBlobAsync(CancellationToken stoppingToken) { - return ExecuteActionOnIntervalAsync(runInterval, + return ExecuteActionOnIntervalAsync(blobRunInterval, UpdateTesTasksFromEventBlobsAsync, stoppingToken); } From 8cfe810f0c5fa793112e20db091661c3c4fff186 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Tue, 3 Oct 2023 20:39:14 -0700 Subject: [PATCH 015/202] Fix formatting, difference readability --- src/TesApi.Tests/BatchSchedulerTests.cs | 37 ++- src/TesApi.Web/AzureProxy.cs | 137 --------- src/TesApi.Web/BatchScheduler.cs | 265 ++---------------- .../CachingWithRetriesAzureProxy.cs | 4 +-
src/TesApi.Web/IAzureProxy.cs | 3 - src/TesApi.Web/IBatchScheduler.cs | 12 +- .../Management/BatchQuotaVerifier.cs | 5 + 7 files changed, 46 insertions(+), 417 deletions(-) diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index b34688eb7..d956770d6 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -25,7 +25,6 @@ using TesApi.Web.Management; using TesApi.Web.Management.Models.Quotas; using TesApi.Web.Storage; -using ResourceFile = Microsoft.Azure.Batch.ResourceFile; namespace TesApi.Tests { @@ -297,11 +296,28 @@ public async Task TesTaskRemainsQueuedWhenBatchQuotaIsTemporarilyUnavailable() azureProxyReturnValues.BatchQuotas = new() { ActiveJobAndJobScheduleQuota = 1, PoolQuota = 1, DedicatedCoreQuota = 9, LowPriorityCoreQuota = 17 }; + azureProxyReturnValues.ActiveNodeCountByVmSize = new List { + new() { VirtualMachineSize = "VmSize1", DedicatedNodeCount = 4, LowPriorityNodeCount = 8 } // 8 (4 * 2) dedicated and 16 (8 * 2) low pri cores are in use, there is no more room for 2 cores + }; + + // The actual CPU core count (2) of the selected VM is used for quota calculation, not the TesResources CpuCores requirement + Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(new TesResources { CpuCores = 1, RamGb = 1, Preemptible = false }, azureProxyReturnValues)); + Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(new TesResources { CpuCores = 1, RamGb = 1, Preemptible = true }, azureProxyReturnValues)); + azureProxyReturnValues.ActiveNodeCountByVmSize = new List { new() { VirtualMachineSize = "VmSize1", DedicatedNodeCount = 4, LowPriorityNodeCount = 7 } // 8 dedicated and 14 low pri cores are in use }; Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(new TesResources { CpuCores = 1, RamGb = 1, Preemptible = true }, azureProxyReturnValues)); + + var dedicatedCoreQuotaPerVMFamily = new List { new("VmFamily1", 100) }; + 
azureProxyReturnValues.BatchQuotas = new() { ActiveJobAndJobScheduleQuota = 1, PoolQuota = 1, DedicatedCoreQuota = 9, LowPriorityCoreQuota = 17, DedicatedCoreQuotaPerVMFamilyEnforced = true, DedicatedCoreQuotaPerVMFamily = dedicatedCoreQuotaPerVMFamily }; + + azureProxyReturnValues.ActiveNodeCountByVmSize = new List { + new() { VirtualMachineSize = "VmSize1", DedicatedNodeCount = 4, LowPriorityNodeCount = 8 } // 8 (4 * 2) dedicated and 16 (8 * 2) low pri cores are in use, there is no more room for 2 cores + }; + + Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(new TesResources { CpuCores = 1, RamGb = 1, Preemptible = false }, azureProxyReturnValues)); } private async Task AddBatchTaskHandlesExceptions(TesState newState, Func, Action>)> testArranger, Action> resultValidator) @@ -607,7 +623,7 @@ public async Task BatchJobContainsExpectedBatchPoolInformation() GetMockAllowedVms(config)); var batchScheduler = serviceProvider.GetT(); - await foreach (var _ in batchScheduler.ProcessQueuedTesTasksAsync(new[] { tesTask }, System.Threading.CancellationToken.None)); + await foreach (var _ in batchScheduler.ProcessQueuedTesTasksAsync(new[] { tesTask }, System.Threading.CancellationToken.None)) { } var createBatchPoolAsyncInvocation = serviceProvider.AzureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IAzureProxy.CreateBatchPoolAsync)); var addBatchTaskAsyncInvocation = serviceProvider.AzureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IAzureProxy.AddBatchTaskAsync)); @@ -1666,9 +1682,6 @@ private static Action> GetMockAzureProxy(AzureProxyReturnValue azureProxy.Setup(a => a.GetActivePoolsAsync(It.IsAny())) .Returns(AsyncEnumerable.Empty()); - //azureProxy.Setup(a => a.GetBatchJobAndTaskStateAsync(It.IsAny(), It.IsAny())) - // .Returns(Task.FromResult(azureProxyReturnValues.BatchJobAndTaskState)); - azureProxy.Setup(a => a.GetStorageAccountInfoAsync("defaultstorageaccount", It.IsAny())) 
.Returns(Task.FromResult(azureProxyReturnValues.StorageAccountInfos["defaultstorageaccount"])); @@ -1822,20 +1835,6 @@ internal static async ValueTask GuardAssertsWithTesTask(TesTask tesTask, Func default; diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index 85657043e..6528a6bf8 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -236,143 +236,6 @@ public async Task DeleteBatchJobAsync(PoolInformation poolInformation, Cancellat await batchClient.JobOperations.DeleteJobAsync(poolInformation.PoolId, cancellationToken: cancellationToken); } - ///// - //[System.Diagnostics.CodeAnalysis.SuppressMessage("Performance", "CA1826:Do not use Enumerable methods on indexable collections", Justification = "FirstOrDefault() is straightforward, the alternative is less clear.")] - //public async Task GetBatchJobAndTaskStateAsync(TesTask tesTask, CancellationToken cancellationToken) - //{ - // try - // { - // string nodeErrorCode = null; - // IEnumerable nodeErrorDetails = null; - // var activeJobWithMissingAutoPool = false; - // ComputeNodeState? nodeState = null; - // TaskState? 
taskState = null; - // string poolId = null; - // TaskExecutionInformation taskExecutionInformation = null; - // CloudJob job = null; - // var attemptNumber = 0; - // CloudTask batchTask = null; - - // var jobOrTaskFilter = new ODATADetailLevel - // { - // FilterClause = $"startswith(id,'{tesTask.Id}{BatchJobAttemptSeparator}')", - // SelectClause = "*" - // }; - - // if (string.IsNullOrWhiteSpace(tesTask.PoolId)) - // { - // return new AzureBatchJobAndTaskState { JobState = null }; - // } - - // try - // { - // job = await batchClient.JobOperations.GetJobAsync(tesTask.PoolId, cancellationToken: cancellationToken); - // } - // catch (BatchException ex) when (ex.InnerException is Microsoft.Azure.Batch.Protocol.Models.BatchErrorException e && e.Response.StatusCode == System.Net.HttpStatusCode.NotFound) - // { - // logger.LogError(ex, @"Failed to get job for TesTask {TesTask}", tesTask.Id); - // return new AzureBatchJobAndTaskState { JobState = null }; - // } - - // var taskInfos = await batchClient.JobOperations.ListTasks(tesTask.PoolId, jobOrTaskFilter).ToAsyncEnumerable() - // .Select(t => new { Task = t, AttemptNumber = int.Parse(t.Id.Split(BatchJobAttemptSeparator)[1]) }) - // .ToListAsync(cancellationToken); - - // if (!taskInfos.Any()) - // { - // logger.LogError(@"Failed to get task for TesTask {TesTask}", tesTask.Id); - // } - // else - // { - // if (taskInfos.Count(t => t.Task.State != TaskState.Completed) > 1) - // { - // return new AzureBatchJobAndTaskState { MoreThanOneActiveJobOrTaskFound = true }; - // } - - // var lastTaskInfo = taskInfos.OrderBy(t => t.AttemptNumber).Last(); - // batchTask = lastTaskInfo.Task; - // attemptNumber = lastTaskInfo.AttemptNumber; - // } - - // poolId = job.ExecutionInformation?.PoolId; - - - // var nodeId = string.Empty; - - // if (job.State == JobState.Active && poolId is not null) - // { - // var poolFilter = new ODATADetailLevel - // { - // SelectClause = "*" - // }; - - // CloudPool pool; - - // try - // { - // pool = 
await batchClient.PoolOperations.GetPoolAsync(poolId, poolFilter, cancellationToken: cancellationToken); - // } - // catch (BatchException ex) when (ex.InnerException is Microsoft.Azure.Batch.Protocol.Models.BatchErrorException e && e.Response?.StatusCode == System.Net.HttpStatusCode.NotFound) - // { - // pool = default; - // } - - // if (pool is not null) - // { - // var node = await pool.ListComputeNodes().ToAsyncEnumerable().FirstOrDefaultAsync(n => (n.RecentTasks?.Select(t => t.TaskId) ?? Enumerable.Empty()).Contains(batchTask?.Id), cancellationToken); - - // if (node is not null) - // { - // nodeId = node.Id; - // nodeState = node.State; - // var nodeError = node.Errors?.FirstOrDefault(e => "DiskFull".Equals(e.Code, StringComparison.InvariantCultureIgnoreCase)) ?? node.Errors?.FirstOrDefault(); // Prioritize DiskFull errors - // nodeErrorCode = nodeError?.Code; - // nodeErrorDetails = nodeError?.ErrorDetails?.Select(e => e.Value); - // } - // } - // else - // { - // if (job.CreationTime.HasValue && DateTime.UtcNow.Subtract(job.CreationTime.Value) > TimeSpan.FromMinutes(30)) - // { - // activeJobWithMissingAutoPool = true; - // } - // } - // } - - // if (batchTask is not null) - // { - // taskState = batchTask.State; - // taskExecutionInformation = batchTask.ExecutionInformation; - // } - - // return new AzureBatchJobAndTaskState - // { - // MoreThanOneActiveJobOrTaskFound = false, - // ActiveJobWithMissingAutoPool = activeJobWithMissingAutoPool, - // AttemptNumber = attemptNumber, - // NodeErrorCode = nodeErrorCode, - // NodeErrorDetails = nodeErrorDetails, - // NodeState = nodeState, - // JobState = job.State, - // TaskState = taskState, - // Pool = new() { PoolId = poolId }, - // TaskExecutionResult = taskExecutionInformation?.Result, - // TaskStartTime = taskExecutionInformation?.StartTime, - // TaskEndTime = taskExecutionInformation?.EndTime, - // TaskExitCode = taskExecutionInformation?.ExitCode, - // TaskFailureInformation = 
taskExecutionInformation?.FailureInformation, - // TaskContainerState = taskExecutionInformation?.ContainerInformation?.State, - // TaskContainerError = taskExecutionInformation?.ContainerInformation?.Error, - // NodeId = !string.IsNullOrEmpty(nodeId) ? nodeId : null - // }; - // } - // catch (Exception ex) - // { - // logger.LogError(ex, @"GetBatchJobAndTaskStateAsync failed for TesTask {TesTask}", tesTask.Id); - // throw; - // } - //} - /// public async Task DeleteBatchJobAsync(string tesTaskId, CancellationToken cancellationToken = default) { diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index f5b53e7d3..a15927118 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -14,7 +14,6 @@ using Microsoft.Azure.Batch.Common; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; -using Newtonsoft.Json; using Tes.Extensions; using Tes.Models; using TesApi.Web.Extensions; @@ -308,7 +307,7 @@ private async Task TerminateBatchTaskAsync(TesTask tesTask, CancellationTo tesTask.State = TesState.CANCELEDEnum; return true; } - //TODO: catch exception returned if the task as already completed. + //TODO: catch exception returned if the task was already completed. 
catch (Exception exc) { logger.LogError(exc, "Exception terminating batch task with tesTask.Id: {TesTaskId}", tesTask?.Id); @@ -367,6 +366,21 @@ public async Task UploadTaskRunnerIfNeeded(CancellationToken cancellationToken) } } + /// + public IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessTesTaskBatchStatesAsync(IEnumerable tesTasks, AzureBatchTaskState[] taskStates, CancellationToken cancellationToken) + { + ArgumentNullException.ThrowIfNull(tesTasks); + ArgumentNullException.ThrowIfNull(taskStates); + + return taskStates.Zip(tesTasks, (TaskState, TesTask) => (TaskState, TesTask)) + .Where(entry => entry.TesTask.IsActiveState()) // Removes already terminal TesTasks + .Select(entry => (entry.TesTask, IsModifiedAsync: WrapHandleTesTaskTransitionAsync(entry.TesTask, entry.TaskState, cancellationToken))) + .WhenEach(cancellationToken, tuple => tuple.IsModifiedAsync); + + async Task WrapHandleTesTaskTransitionAsync(TesTask tesTask, AzureBatchTaskState azureBatchTaskState, CancellationToken cancellationToken) + => await HandleTesTaskTransitionAsync(tesTask, azureBatchTaskState, cancellationToken); + } + private static string GetCromwellExecutionDirectoryPathAsUrl(TesTask task) { var commandScript = task.Inputs?.FirstOrDefault(IsCromwellCommandScript); @@ -682,253 +696,6 @@ Task HandleException(Exception exception, string poolKey, TesTask tesTask) } } - ///// - ///// Gets the current state of the Azure Batch task - ///// - ///// - ///// A for controlling the lifetime of the asynchronous operation. 
- ///// A higher-level abstraction of the current state of the Azure Batch task - //[System.Diagnostics.CodeAnalysis.SuppressMessage("Performance", "CA1826:Do not use Enumerable methods on indexable collections", Justification = "FirstOrDefault() is straightforward, the alternative is less clear.")] - //private async ValueTask GetBatchTaskStateAsync(TesTask tesTask, CloudTask cloudTask, CancellationToken cancellationToken) - //{ - // var azureBatchJobAndTaskState = await azureProxy.GetBatchJobAndTaskStateAsync(tesTask, cancellationToken); - - // if (azureBatchJobAndTaskState.Pool?.PoolId is null) - // { - // azureBatchJobAndTaskState.Pool = tesTask.PoolId is null ? default : new() { PoolId = tesTask.PoolId }; - // } - - // static IEnumerable ConvertNodeErrorsToSystemLogItems(AzureBatchJobAndTaskState azureBatchJobAndTaskState) - // { - // var systemLogItems = new List(); - - // if (azureBatchJobAndTaskState.NodeErrorCode is not null) - // { - // systemLogItems.Add(azureBatchJobAndTaskState.NodeErrorCode); - // } - - // if (azureBatchJobAndTaskState.NodeErrorDetails is not null) - // { - // systemLogItems.AddRange(azureBatchJobAndTaskState.NodeErrorDetails); - // } - - // return systemLogItems; - // } - - // if (azureBatchJobAndTaskState.ActiveJobWithMissingAutoPool) - // { - // logger.LogWarning("Found active job without auto pool for TES task {TesTask}. Deleting the job and requeuing the task. 
BatchJobInfo: {BatchJobInfo}", tesTask.Id, JsonConvert.SerializeObject(azureBatchJobAndTaskState)); - // return new CombinedBatchTaskInfo - // { - // BatchTaskState = BatchTaskState.ActiveJobWithMissingAutoPool, - // FailureReason = BatchTaskState.ActiveJobWithMissingAutoPool.ToString(), - // Pool = azureBatchJobAndTaskState.Pool - // }; - // } - - // if (azureBatchJobAndTaskState.MoreThanOneActiveJobOrTaskFound) - // { - // return new CombinedBatchTaskInfo - // { - // BatchTaskState = BatchTaskState.MoreThanOneActiveJobOrTaskFound, - // FailureReason = BatchTaskState.MoreThanOneActiveJobOrTaskFound.ToString(), - // Pool = azureBatchJobAndTaskState.Pool - // }; - // } - - // // Because a ComputeTask is not assigned to the compute node while the StartTask is running, IAzureProxy.GetBatchJobAndTaskStateAsync() does not see start task failures. Deal with that here. - // if (azureBatchJobAndTaskState.NodeState is null && azureBatchJobAndTaskState.JobState == JobState.Active && azureBatchJobAndTaskState.TaskState == TaskState.Active && !string.IsNullOrWhiteSpace(azureBatchJobAndTaskState.Pool?.PoolId)) - // { - // /* - // * Priority order for assigning errors to TesTasks in shared-pool mode: - // * 1. Node error found in GetBatchJobAndTaskStateAsync() - // * 2. StartTask failure - // * 3. NodeAllocation failure - // */ - // if (TryGetPool(azureBatchJobAndTaskState.Pool.PoolId, out var pool)) - // { - // if (!string.IsNullOrWhiteSpace(azureBatchJobAndTaskState.NodeErrorCode) || !ProcessStartTaskFailure(pool.PopNextStartTaskFailure())) - // { - // var resizeError = pool.PopNextResizeError(); - // if (resizeError is not null) - // { - // azureBatchJobAndTaskState.NodeAllocationFailed = true; - // azureBatchJobAndTaskState.NodeErrorCode = resizeError.Code; - // azureBatchJobAndTaskState.NodeErrorDetails = Enumerable.Repeat(resizeError.Message, string.IsNullOrWhiteSpace(resizeError.Message) ? 1 : 0).Concat(resizeError.Values?.Select(d => d.Value) ?? 
Enumerable.Empty()); - // } - // } - // } - - // bool ProcessStartTaskFailure(TaskFailureInformation failureInformation) - // { - // if (failureInformation is not null) - // { - // azureBatchJobAndTaskState.NodeState = ComputeNodeState.StartTaskFailed; - // azureBatchJobAndTaskState.NodeErrorCode = failureInformation.Code; - // azureBatchJobAndTaskState.NodeErrorDetails = failureInformation.Details?.Select(d => d.Value); - // } - - // return failureInformation is not null; - // } - // } - - // if (TaskFailureInformationCodes.DiskFull.Equals(azureBatchJobAndTaskState.NodeErrorCode, StringComparison.OrdinalIgnoreCase)) - // { - // azureBatchJobAndTaskState.NodeErrorDetails = (azureBatchJobAndTaskState.NodeErrorDetails ?? Enumerable.Empty()) - // .Append($"Compute Node Error: {TaskFailureInformationCodes.DiskFull} Id: {azureBatchJobAndTaskState.NodeId}"); - // } - - // switch (azureBatchJobAndTaskState.JobState) - // { - // case null: - // case JobState.Deleting: - // return new CombinedBatchTaskInfo - // { - // BatchTaskState = BatchTaskState.JobNotFound, - // FailureReason = BatchTaskState.JobNotFound.ToString(), - // Pool = azureBatchJobAndTaskState.Pool - // }; - // case JobState.Active: - // { - // if (azureBatchJobAndTaskState.NodeAllocationFailed) - // { - // return new CombinedBatchTaskInfo - // { - // BatchTaskState = BatchTaskState.NodeAllocationFailed, - // FailureReason = BatchTaskState.NodeAllocationFailed.ToString(), - // SystemLogItems = ConvertNodeErrorsToSystemLogItems(azureBatchJobAndTaskState), - // Pool = azureBatchJobAndTaskState.Pool - // }; - // } - - // if (azureBatchJobAndTaskState.NodeState == ComputeNodeState.Unusable) - // { - // return new CombinedBatchTaskInfo - // { - // BatchTaskState = BatchTaskState.NodeUnusable, - // FailureReason = BatchTaskState.NodeUnusable.ToString(), - // SystemLogItems = ConvertNodeErrorsToSystemLogItems(azureBatchJobAndTaskState), - // Pool = azureBatchJobAndTaskState.Pool - // }; - // } - - // if 
(azureBatchJobAndTaskState.NodeState == ComputeNodeState.Preempted) - // { - // return new CombinedBatchTaskInfo - // { - // BatchTaskState = BatchTaskState.NodePreempted, - // FailureReason = BatchTaskState.NodePreempted.ToString(), - // SystemLogItems = ConvertNodeErrorsToSystemLogItems(azureBatchJobAndTaskState), - // Pool = azureBatchJobAndTaskState.Pool - // }; - // } - - // if (azureBatchJobAndTaskState.NodeErrorCode is not null) - // { - // if (azureBatchJobAndTaskState.NodeErrorCode == TaskFailureInformationCodes.DiskFull) - // { - // return new CombinedBatchTaskInfo - // { - // BatchTaskState = BatchTaskState.NodeFailedDuringStartupOrExecution, - // FailureReason = azureBatchJobAndTaskState.NodeErrorCode, - // SystemLogItems = ConvertNodeErrorsToSystemLogItems(azureBatchJobAndTaskState), - // Pool = azureBatchJobAndTaskState.Pool - // }; - // } - // else - // { - // return new CombinedBatchTaskInfo - // { - // BatchTaskState = BatchTaskState.NodeFailedDuringStartupOrExecution, - // FailureReason = BatchTaskState.NodeFailedDuringStartupOrExecution.ToString(), - // SystemLogItems = ConvertNodeErrorsToSystemLogItems(azureBatchJobAndTaskState), - // Pool = azureBatchJobAndTaskState.Pool - // }; - // } - // } - - // break; - // } - // case JobState.Terminating: - // case JobState.Completed: - // break; - // default: - // throw new Exception($"Found batch job {tesTask.Id} in unexpected state: {azureBatchJobAndTaskState.JobState}"); - // } - - // switch (azureBatchJobAndTaskState.TaskState) - // { - // case null: - // return new CombinedBatchTaskInfo - // { - // BatchTaskState = BatchTaskState.MissingBatchTask, - // FailureReason = BatchTaskState.MissingBatchTask.ToString(), - // Pool = azureBatchJobAndTaskState.Pool - // }; - // case TaskState.Active: - // case TaskState.Preparing: - // return new CombinedBatchTaskInfo - // { - // BatchTaskState = BatchTaskState.Initializing, - // Pool = azureBatchJobAndTaskState.Pool - // }; - // case TaskState.Running: - // 
return new CombinedBatchTaskInfo - // { - // BatchTaskState = BatchTaskState.Running, - // Pool = azureBatchJobAndTaskState.Pool - // }; - // case TaskState.Completed: - // if (azureBatchJobAndTaskState.TaskExitCode == 0 && azureBatchJobAndTaskState.TaskFailureInformation is null) - // { - // var metrics = await GetBatchNodeMetricsAndCromwellResultCodeAsync(tesTask, cancellationToken); - - // return new CombinedBatchTaskInfo - // { - // BatchTaskState = BatchTaskState.CompletedSuccessfully, - // BatchTaskExitCode = azureBatchJobAndTaskState.TaskExitCode, - // BatchTaskStartTime = metrics.TaskStartTime ?? azureBatchJobAndTaskState.TaskStartTime, - // BatchTaskEndTime = metrics.TaskEndTime ?? azureBatchJobAndTaskState.TaskEndTime, - // BatchNodeMetrics = metrics.BatchNodeMetrics, - // CromwellRcCode = metrics.CromwellRcCode, - // Pool = azureBatchJobAndTaskState.Pool - // }; - // } - // else - // { - // logger.LogError("Task {TesTask} failed. ExitCode: {TaskExitCode}, BatchJobInfo: {BatchJobInfo}", tesTask.Id, azureBatchJobAndTaskState.TaskExitCode, JsonConvert.SerializeObject(azureBatchJobAndTaskState)); - - // return new CombinedBatchTaskInfo - // { - // BatchTaskState = BatchTaskState.CompletedWithErrors, - // FailureReason = azureBatchJobAndTaskState.TaskFailureInformation?.Code, - // BatchTaskExitCode = azureBatchJobAndTaskState.TaskExitCode, - // BatchTaskStartTime = azureBatchJobAndTaskState.TaskStartTime, - // BatchTaskEndTime = azureBatchJobAndTaskState.TaskEndTime, - // SystemLogItems = Enumerable.Empty() - // .Append($"Batch task ExitCode: {azureBatchJobAndTaskState.TaskExitCode}, Failure message: {azureBatchJobAndTaskState.TaskFailureInformation?.Message}") - // .Concat(azureBatchJobAndTaskState.TaskFailureInformation?.Details?.Select(d => $"{d.Name}: {d.Value}") ?? 
Enumerable.Empty()), - // Pool = azureBatchJobAndTaskState.Pool - // }; - // } - // default: - // throw new Exception($"Found batch task {tesTask.Id} in unexpected state: {azureBatchJobAndTaskState.TaskState}"); - // } - //} - - - /// - public IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessTesTaskBatchStatesAsync(IEnumerable tesTasks, AzureBatchTaskState[] taskStates, CancellationToken cancellationToken) - { - ArgumentNullException.ThrowIfNull(tesTasks); - ArgumentNullException.ThrowIfNull(taskStates); - - return taskStates.Zip(tesTasks, (TaskState, TesTask) => (TaskState, TesTask)) - .Where(entry => entry.TesTask.IsActiveState()) // Removes already terminal TesTasks - .Select(entry => (entry.TesTask, IsModifiedAsync: HandleTesTaskTransitionAsync(entry.TesTask, entry.TaskState, cancellationToken).AsTask())) - .WhenEach(cancellationToken, tuple => tuple.IsModifiedAsync); - } - /// /// Transitions the to the new state, based on the rules defined in the tesTaskStateTransitions list. 
/// diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index 8348b45fb..87d74c4f7 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -94,10 +94,10 @@ public async Task AddBatchTaskAsync(string tesTaskId, CloudTask cloudTask, PoolI public Task DeleteBatchJobAsync(string taskId, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchJobAsync(taskId, ct), cancellationToken); /// - public Task TerminateBatchTaskAsync(string taskId, string jobId, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.TerminateBatchTaskAsync(taskId, jobId, ct), cancellationToken); + public Task DeleteBatchTaskAsync(string taskId, string jobId, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchTaskAsync(taskId, jobId, ct), cancellationToken); /// - public Task DeleteBatchTaskAsync(string taskId, string jobId, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchTaskAsync(taskId, jobId, ct), cancellationToken); + public Task TerminateBatchTaskAsync(string taskId, string jobId, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.TerminateBatchTaskAsync(taskId, jobId, ct), cancellationToken); /// public Task DeleteBatchPoolAsync(string poolId, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchPoolAsync(poolId, ct), cancellationToken); diff --git a/src/TesApi.Web/IAzureProxy.cs b/src/TesApi.Web/IAzureProxy.cs index 543b2183e..e86c36306 100644 --- a/src/TesApi.Web/IAzureProxy.cs +++ b/src/TesApi.Web/IAzureProxy.cs @@ -7,7 +7,6 @@ using System.Threading.Tasks; using Microsoft.Azure.Batch; using Microsoft.Azure.Batch.Common; -using Tes.Models; using 
TesApi.Web.Storage; using BatchModels = Microsoft.Azure.Management.Batch.Models; @@ -57,8 +56,6 @@ public interface IAzureProxy /// A for controlling the lifetime of the asynchronous operation. Task CreateBatchPoolAsync(BatchModels.Pool poolInfo, bool isPreemptable, CancellationToken cancellationToken); - // TODO: we may need a query or two to replace what was here - /// /// Deletes an Azure Batch job for Autopools /// diff --git a/src/TesApi.Web/IBatchScheduler.cs b/src/TesApi.Web/IBatchScheduler.cs index 026d1dd1a..999ea0a8f 100644 --- a/src/TesApi.Web/IBatchScheduler.cs +++ b/src/TesApi.Web/IBatchScheduler.cs @@ -36,23 +36,21 @@ public interface IBatchScheduler Task UploadTaskRunnerIfNeeded(CancellationToken cancellationToken); /// - /// Schedule queued s on a batch system + /// Update s with task-related state on a batch system /// /// s to schedule on the batch system. + /// s corresponding to each . /// A for controlling the lifetime of the asynchronous operation. /// True for each corresponding that needs to be persisted. - IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessQueuedTesTasksAsync(TesTask[] tesTasks, CancellationToken cancellationToken); + IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessTesTaskBatchStatesAsync(IEnumerable tesTasks, AzureBatchTaskState[] taskStates, CancellationToken cancellationToken); /// - /// Assign pool and compute node errors to s on a batch system + /// Schedule queued s on a batch system /// /// s to schedule on the batch system. - /// s corresponding to each . /// A for controlling the lifetime of the asynchronous operation. /// True for each corresponding that needs to be persisted. - IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessTesTaskBatchStatesAsync(IEnumerable tesTasks, AzureBatchTaskState[] taskStates, CancellationToken cancellationToken); - - //TODO: Add other task update methods here. 
+ IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessQueuedTesTasksAsync(TesTask[] tesTasks, CancellationToken cancellationToken); /// /// Adds to the managed batch pools. diff --git a/src/TesApi.Web/Management/BatchQuotaVerifier.cs b/src/TesApi.Web/Management/BatchQuotaVerifier.cs index 2d96ce5aa..df0c14fb6 100644 --- a/src/TesApi.Web/Management/BatchQuotaVerifier.cs +++ b/src/TesApi.Web/Management/BatchQuotaVerifier.cs @@ -114,6 +114,11 @@ public async Task CheckBatchAccountQuotasAsync(VirtualMachineInformation virtual throw exception; } } + + if ((batchUtilization.TotalCoresInUse + workflowCoresRequirement) > batchVmFamilyBatchQuotas.TotalCoreQuota) + { + throw new AzureBatchQuotaMaxedOutException($"Not enough core quota remaining to schedule task requiring {workflowCoresRequirement} {(isDedicated ? "dedicated" : "low priority")} cores. There are {batchUtilization.TotalCoresInUse} cores in use out of {batchVmFamilyBatchQuotas.TotalCoreQuota}."); + } } /// From 594ab0f166ccb221b7c4b051005da6171ce43b55 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 4 Oct 2023 06:48:33 -0700 Subject: [PATCH 016/202] Correct query format --- src/TesApi.Web/BatchPool.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index d0a0d5ba0..1086a87d8 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -464,7 +464,7 @@ await PerformTask(ServicePoolAsync(ServiceKind.Rotate, cancellationToken), cance switch (exceptions.Count) { case 0: - await foreach (var (id, state) in _azureProxy.ListTasksAsync(Pool.PoolId, new ODATADetailLevel { FilterClause = "id", SelectClause = "state eq active" }).Select(cloud => cloud.Id).Zip(GetFailures(cancellationToken), (id, state) => (id, state)).WithCancellation(cancellationToken)) + await foreach (var (id, state) in _azureProxy.ListTasksAsync(Pool.PoolId, new ODATADetailLevel { FilterClause = "id", SelectClause = "state eq 'active'" 
}).Select(cloud => cloud.Id).Zip(GetFailures(cancellationToken), (id, state) => (id, state)).WithCancellation(cancellationToken)) { yield return (id, state); } @@ -540,7 +540,7 @@ TaskFailureInformation PopNextStartTaskFailure() /// public IAsyncEnumerable GetCompletedTasks(CancellationToken _1) { - return _azureProxy.ListTasksAsync(Pool.PoolId, new ODATADetailLevel { FilterClause = "id,executionInfo", SelectClause = "state eq completed" }); + return _azureProxy.ListTasksAsync(Pool.PoolId, new ODATADetailLevel { FilterClause = "id,executionInfo", SelectClause = "state eq 'completed'" }); } // Returns false when pool/job was removed because it was not found. Returns true if the error was completely something else. From 3b896d412bf1794e2b48e1f94a18386c21c72eed Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 4 Oct 2023 07:35:22 -0700 Subject: [PATCH 017/202] Correct query properties --- src/TesApi.Web/BatchPool.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index 1086a87d8..8afc85ecc 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -464,7 +464,7 @@ await PerformTask(ServicePoolAsync(ServiceKind.Rotate, cancellationToken), cance switch (exceptions.Count) { case 0: - await foreach (var (id, state) in _azureProxy.ListTasksAsync(Pool.PoolId, new ODATADetailLevel { FilterClause = "id", SelectClause = "state eq 'active'" }).Select(cloud => cloud.Id).Zip(GetFailures(cancellationToken), (id, state) => (id, state)).WithCancellation(cancellationToken)) + await foreach (var (id, state) in _azureProxy.ListTasksAsync(Pool.PoolId, new ODATADetailLevel { SelectClause = "id", FilterClause = "state eq 'active'" }).Select(cloud => cloud.Id).Zip(GetFailures(cancellationToken), (id, state) => (id, state)).WithCancellation(cancellationToken)) { yield return (id, state); } @@ -540,7 +540,7 @@ TaskFailureInformation PopNextStartTaskFailure() /// public IAsyncEnumerable 
GetCompletedTasks(CancellationToken _1) { - return _azureProxy.ListTasksAsync(Pool.PoolId, new ODATADetailLevel { FilterClause = "id,executionInfo", SelectClause = "state eq 'completed'" }); + return _azureProxy.ListTasksAsync(Pool.PoolId, new ODATADetailLevel { SelectClause = "id,executionInfo", FilterClause = "state eq 'completed'" }); } // Returns false when pool/job was removed because it was not found. Returns true if the error was completely something else. From c0d1f6b835580310ebaefc1fab6595acb156b003 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 4 Oct 2023 12:12:32 -0700 Subject: [PATCH 018/202] TEMP: Change logging for visibility --- src/TesApi.Web/OrchestrateOnBatchSchedulerService.cs | 3 ++- src/TesApi.Web/appsettings.json | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/TesApi.Web/OrchestrateOnBatchSchedulerService.cs b/src/TesApi.Web/OrchestrateOnBatchSchedulerService.cs index c6c53b8c7..a1d555cca 100644 --- a/src/TesApi.Web/OrchestrateOnBatchSchedulerService.cs +++ b/src/TesApi.Web/OrchestrateOnBatchSchedulerService.cs @@ -142,6 +142,7 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func< if (0 == tesTasks.Length) { + logger.LogInformation("OrchestrateTesTasksOnBatch({Poll}) skipped.", pollName); return; } @@ -273,7 +274,7 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func< await batchScheduler.FlushPoolsAsync(pools, stoppingToken); } - logger.LogDebug("OrchestrateTesTasksOnBatch({Poll}) for {TaskCount} tasks completed in {TotalSeconds} seconds.", pollName, tesTasks.Length, DateTime.UtcNow.Subtract(startTime).TotalSeconds); + logger.LogInformation("OrchestrateTesTasksOnBatch({Poll}) for {TaskCount} tasks completed in {TotalSeconds} seconds.", pollName, tesTasks.Length, DateTime.UtcNow.Subtract(startTime).TotalSeconds); } } } diff --git a/src/TesApi.Web/appsettings.json b/src/TesApi.Web/appsettings.json index 59f838cd6..b98e90920 100644 --- 
a/src/TesApi.Web/appsettings.json +++ b/src/TesApi.Web/appsettings.json @@ -1,7 +1,7 @@ { "Logging": { "LogLevel": { - "Default": "Warning" + "Default": "Information" } }, "AllowedHosts": "*", From 3b74390536531760b1caed2de05f76c7736b3453 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 4 Oct 2023 15:07:02 -0700 Subject: [PATCH 019/202] Rename OrchestrateOnBatchSchedulerService to OrchestrateOnBatchSchedulerServiceBase --- src/TesApi.Web/BatchPoolService.cs | 2 +- ...erService.cs => OrchestrateOnBatchSchedulerServiceBase.cs} | 4 ++-- src/TesApi.Web/Scheduler.cs | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) rename src/TesApi.Web/{OrchestrateOnBatchSchedulerService.cs => OrchestrateOnBatchSchedulerServiceBase.cs} (98%) diff --git a/src/TesApi.Web/BatchPoolService.cs b/src/TesApi.Web/BatchPoolService.cs index a1522b110..99919bda2 100644 --- a/src/TesApi.Web/BatchPoolService.cs +++ b/src/TesApi.Web/BatchPoolService.cs @@ -20,7 +20,7 @@ namespace TesApi.Web /// This should only be used as a system-wide singleton service. This class does not support scale-out on multiple machines, /// nor does it implement a leasing mechanism. In the future, consider using the Lease Blob operation. /// - internal class BatchPoolService : OrchestrateOnBatchSchedulerService + internal class BatchPoolService : OrchestrateOnBatchSchedulerServiceBase { /// /// Interval between each call to . diff --git a/src/TesApi.Web/OrchestrateOnBatchSchedulerService.cs b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs similarity index 98% rename from src/TesApi.Web/OrchestrateOnBatchSchedulerService.cs rename to src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs index a1d555cca..82618fbd1 100644 --- a/src/TesApi.Web/OrchestrateOnBatchSchedulerService.cs +++ b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs @@ -20,7 +20,7 @@ namespace TesApi.Web /// This should only be used to build system-wide singleton services. 
This class does not support scale-out on multiple machines, /// nor does it implement a leasing mechanism. In the future, consider using the Lease Blob operation. /// - internal abstract class OrchestrateOnBatchSchedulerService : BackgroundService + internal abstract class OrchestrateOnBatchSchedulerServiceBase : BackgroundService { protected readonly IRepository repository; protected readonly IBatchScheduler batchScheduler; @@ -32,7 +32,7 @@ internal abstract class OrchestrateOnBatchSchedulerService : BackgroundService /// The main TES task database repository implementation /// The batch scheduler implementation /// The logger instance - protected OrchestrateOnBatchSchedulerService(IRepository repository, IBatchScheduler batchScheduler, ILogger logger) + protected OrchestrateOnBatchSchedulerServiceBase(IRepository repository, IBatchScheduler batchScheduler, ILogger logger) { this.repository = repository; this.batchScheduler = batchScheduler; diff --git a/src/TesApi.Web/Scheduler.cs b/src/TesApi.Web/Scheduler.cs index a80abcd39..1e8e74adf 100644 --- a/src/TesApi.Web/Scheduler.cs +++ b/src/TesApi.Web/Scheduler.cs @@ -17,7 +17,7 @@ namespace TesApi.Web /// This should only be used as a system-wide singleton service. This class does not support scale-out on multiple machines, /// nor does it implement a leasing mechanism. In the future, consider using the Lease Blob operation. 
/// - internal class Scheduler : OrchestrateOnBatchSchedulerService + internal class Scheduler : OrchestrateOnBatchSchedulerServiceBase { private readonly TimeSpan blobRunInterval = TimeSpan.FromSeconds(5); private readonly TimeSpan batchRunInterval = TimeSpan.FromSeconds(30); // The very fastest process inside of Azure Batch accessing anything within pools or jobs uses a 30 second polling interval From 945649f519e501c7eb469d276b45b6bb9d59d82e Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 5 Oct 2023 06:41:35 -0700 Subject: [PATCH 020/202] Further isolate the issues --- src/TesApi.Tests/BatchPoolTests.cs | 101 ++++++++++++++++-- src/TesApi.Web/AzureProxy.cs | 25 +++-- src/TesApi.Web/BatchPool.cs | 24 +++-- src/TesApi.Web/BatchPoolService.cs | 1 + .../CachingWithRetriesAzureProxy.cs | 3 + src/TesApi.Web/IAzureProxy.cs | 48 ++++++--- .../Management/Batch/ArmBatchPoolManager.cs | 6 +- .../OrchestrateOnBatchSchedulerServiceBase.cs | 4 +- src/TesApi.Web/appsettings.json | 7 ++ 9 files changed, 172 insertions(+), 47 deletions(-) diff --git a/src/TesApi.Tests/BatchPoolTests.cs b/src/TesApi.Tests/BatchPoolTests.cs index bd455df3f..14ffd62d0 100644 --- a/src/TesApi.Tests/BatchPoolTests.cs +++ b/src/TesApi.Tests/BatchPoolTests.cs @@ -4,6 +4,7 @@ using System; using System.Collections.Generic; using System.Linq; +using System.Threading; using System.Threading.Tasks; using Microsoft.Azure.Batch; using Microsoft.Azure.Management.Batch.Models; @@ -97,12 +98,31 @@ void DeletePool(string poolId, System.Threading.CancellationToken cancellationTo } } + [TestMethod] + public async Task ServicePoolGetResizeErrorsResetsAutoScalingWhenBatchStopsEvaluatingAutoScaleAfterAutoScaleFormulaError() + { + var azureProxy = AzureProxyReturnValues.Get(); + var services = GetServiceProvider(azureProxy); + var pool = await AddPool(services.GetT(), false); + + azureProxy.SetPoolState( + pool.Pool.PoolId, + enableAutoScale: true, + autoScaleRun: new(DateTime.UtcNow, error: new("ErrorCode", 
"Message"))); + + await pool.ServicePoolAsync(BatchPool.ServiceKind.GetResizeErrors); + await pool.ServicePoolAsync(BatchPool.ServiceKind.ManagePoolScaling); + + services.AzureProxy.Verify(a => a.DisableBatchPoolAutoScaleAsync(pool.Pool.PoolId, It.IsAny())); + } + [TestMethod] public async Task ServicePoolGetResizeErrorsResetsAutoScalingWhenBatchStopsEvaluatingAutoScaleAfterQuotaError() { var azureProxy = AzureProxyReturnValues.Get(); var services = GetServiceProvider(azureProxy); var pool = await AddPool(services.GetT(), false); + pool.TimeShift(7 * BatchPool.AutoScaleEvaluationInterval); azureProxy.SetPoolState( pool.Pool.PoolId, @@ -115,6 +135,26 @@ public async Task ServicePoolGetResizeErrorsResetsAutoScalingWhenBatchStopsEvalu services.AzureProxy.Verify(a => a.DisableBatchPoolAutoScaleAsync(pool.Pool.PoolId, It.IsAny())); } + [TestMethod] + public async Task ServicePoolGetResizeErrorsResetsAutoScalingWhenBatchStopsEvaluatingAutoScaleLongAllocationStateTransitionDelay() + { + var azureProxy = AzureProxyReturnValues.Get(); + azureProxy.EvaluateAutoScale = (id, formula) => GenerateAutoScaleRun(error: new()); + var services = GetServiceProvider(azureProxy); + var pool = await AddPool(services.GetT(), false); + pool.TimeShift(12 * BatchPool.AutoScaleEvaluationInterval); + + azureProxy.SetPoolState( + pool.Pool.PoolId, + enableAutoScale: true, + resizeErrors: Enumerable.Repeat(new(Microsoft.Azure.Batch.Common.PoolResizeErrorCodes.AccountCoreQuotaReached, "Core quota reached."), 1).ToList(), + allocationStateTransitionTime: DateTime.UtcNow - (11 * BatchPool.AutoScaleEvaluationInterval)); + + await pool.ServicePoolAsync(BatchPool.ServiceKind.GetResizeErrors); + await pool.ServicePoolAsync(BatchPool.ServiceKind.ManagePoolScaling); + + services.AzureProxy.Verify(a => a.DisableBatchPoolAutoScaleAsync(pool.Pool.PoolId, It.IsAny())); + } private static TestServices.TestServiceProvider GetServiceProvider(AzureProxyReturnValues azureProxyReturn = default) { @@ -147,6 +187,7 
@@ internal static AzureProxyReturnValues Get() internal Func AzureProxyGetComputeNodeAllocationState { get; set; } = null; internal Action AzureProxyDeleteBatchPool { get; set; } = (poolId, cancellationToken) => { }; internal Func> AzureProxyListTasks { get; set; } = (jobId, detailLevel) => AsyncEnumerable.Empty(); + internal Func EvaluateAutoScale { get; set; } //= new((poolId, autoscaleFormula) => AutoScaleRun); internal List VmSizesAndPrices { get; set; } = new(); internal static Func AzureProxyGetComputeNodeAllocationStateDefault = id => (Microsoft.Azure.Batch.Common.AllocationState.Steady, true, 0, 0, 0, 0); @@ -154,7 +195,7 @@ internal static AzureProxyReturnValues Get() internal bool PoolStateExists(string poolId) => poolState.ContainsKey(poolId); - private readonly Dictionary PoolMetadata)> poolState = new(); + private readonly Dictionary ResizeErrors, IList PoolMetadata)> poolState = new(); internal void SetPoolState( string id, @@ -162,10 +203,11 @@ internal void SetPoolState( int? currentLowPriorityNodes = default, int? targetDedicatedNodes = default, int? targetLowPriorityNodes = default, + DateTime? allocationStateTransitionTime = default, Microsoft.Azure.Batch.Common.AllocationState? allocationState = default, + IList resizeErrors = default, Microsoft.Azure.Batch.Protocol.Models.AutoScaleRun autoScaleRun = default, bool? enableAutoScale = default, - DateTime? creationTime = default, IList poolMetadata = default) { if (poolState.TryGetValue(id, out var state)) @@ -191,7 +233,8 @@ internal void SetPoolState( allocationState ?? state.AllocationState, autoScaleRun ?? state.AutoScaleRun, enableAutoScale ?? state.EnableAutoScale, - creationTime ?? state.CreationTime, + allocationStateTransitionTime ?? state.AllocationStateTransitionTime, + resizeErrors ?? state.ResizeErrors, metadata.Count == 0 ? 
null : metadata.Select(ConvertMetadata).ToList()); static Microsoft.Azure.Batch.MetadataItem ConvertMetadata(KeyValuePair pair) @@ -199,7 +242,7 @@ static Microsoft.Azure.Batch.MetadataItem ConvertMetadata(KeyValuePair new(item.Name, item.Value); @@ -234,9 +298,10 @@ internal CloudPool GetBatchPoolImpl(string poolId) targetDedicatedNodes: state.TargetDedicatedNodes, targetLowPriorityNodes: state.TargetLowPriorityNodes, allocationState: state.AllocationState, + allocationStateTransitionTime: state.AllocationStateTransitionTime, + resizeErrors: state.ResizeErrors, autoScaleRun: state.AutoScaleRun, enableAutoScale: state.EnableAutoScale, - creationTime: state.CreationTime, metadata: state.PoolMetadata); } } @@ -293,9 +358,9 @@ private static Action> PrepareMockAzureProxy(AzureProxyReturnV azureProxy.Setup(a => a.ListTasksAsync(It.IsAny(), It.IsAny())).Returns((jobId, detailLevel) => azureProxyReturnValues.AzureProxyListTasks(jobId, detailLevel)); azureProxy.Setup(a => a.DeleteBatchComputeNodesAsync(It.IsAny(), It.IsAny>(), It.IsAny())).Callback, System.Threading.CancellationToken>((poolId, computeNodes, cancellationToken) => azureProxyReturnValues.AzureProxyDeleteBatchComputeNodes(poolId, computeNodes, cancellationToken)).Returns(Task.CompletedTask); azureProxy.Setup(a => a.GetBatchPoolAsync(It.IsAny(), It.IsAny(), It.IsAny())).Returns((string id, System.Threading.CancellationToken cancellationToken, DetailLevel detailLevel) => Task.FromResult(azureProxyReturnValues.GetBatchPoolImpl(id))); - azureProxy.Setup(a => a.GetFullAllocationStateAsync(It.IsAny(), It.IsAny())).Returns((string poolId, System.Threading.CancellationToken _1) => - Task.FromResult(GetPoolStateFromSettingStateOrDefault(poolId))); + azureProxy.Setup(a => a.GetFullAllocationStateAsync(It.IsAny(), It.IsAny())).Returns((string poolId, System.Threading.CancellationToken _1) => Task.FromResult(GetPoolStateFromSettingStateOrDefault(poolId))); azureProxy.Setup(a => a.DeleteBatchPoolAsync(It.IsAny(), 
It.IsAny())).Callback((poolId, cancellationToken) => azureProxyReturnValues.AzureProxyDeleteBatchPoolImpl(poolId, cancellationToken)).Returns(Task.CompletedTask); + azureProxy.Setup(a => a.EvaluateAutoScaleAsync(It.IsAny(), It.IsAny(), It.IsAny())).Returns((string poolId, string autoscaleFormula, CancellationToken _1) => Task.FromResult(azureProxyReturnValues.EvaluateAutoScale(poolId, autoscaleFormula))); (Microsoft.Azure.Batch.Common.AllocationState? AllocationState, bool? AutoScaleEnabled, int? TargetLowPriority, int? CurrentLowPriority, int? TargetDedicated, int? CurrentDedicated) GetPoolStateFromSettingStateOrDefault(string poolId) { @@ -336,6 +401,16 @@ public MockServiceClient(Microsoft.Azure.Batch.Protocol.IComputeNodeOperations c } // Below this line we use reflection and internal details of the Azure libraries in order to generate Mocks of CloudPool and ComputeNode. A newer version of the library is supposed to enable this scenario, so hopefully we can soon drop this code. + internal static Microsoft.Azure.Batch.AutoScaleRun GenerateAutoScaleRun(Microsoft.Azure.Batch.Protocol.Models.AutoScaleRunError error = default, string results = default) + { + var protocolObject = new Microsoft.Azure.Batch.Protocol.Models.AutoScaleRun(DateTime.UtcNow, results, error); + var autoScaleRun = (Microsoft.Azure.Batch.AutoScaleRun)typeof(Microsoft.Azure.Batch.AutoScaleRun).GetConstructor(System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance, default, new Type[] { typeof(Microsoft.Azure.Batch.Protocol.Models.AutoScaleRun) }, default) + .Invoke(new object[] { protocolObject }); + return autoScaleRun; + } + + + internal static CloudPool GeneratePool( string id, int? currentDedicatedNodes = default, @@ -343,6 +418,8 @@ internal static CloudPool GeneratePool( int? targetDedicatedNodes = default, int? targetLowPriorityNodes = default, Microsoft.Azure.Batch.Common.AllocationState? 
allocationState = Microsoft.Azure.Batch.Common.AllocationState.Steady, + DateTime? allocationStateTransitionTime = default, + IList resizeErrors = default, Microsoft.Azure.Batch.Protocol.Models.AutoScaleRun autoScaleRun = default, bool? enableAutoScale = default, DateTime? creationTime = default, @@ -368,6 +445,8 @@ internal static CloudPool GeneratePool( targetDedicatedNodes: targetDedicatedNodes, targetLowPriorityNodes: targetLowPriorityNodes, allocationState: (Microsoft.Azure.Batch.Protocol.Models.AllocationState)allocationState, + allocationStateTransitionTime: allocationStateTransitionTime, + resizeErrors: resizeErrors, autoScaleRun: autoScaleRun, enableAutoScale: enableAutoScale, creationTime: creationTime, diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index 6528a6bf8..122e26d19 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -88,12 +88,12 @@ public AzureProxy(IOptions batchAccountOptions, IBatchPoolM .Handle(ex => "NodeNotReady".Equals(ex.RequestInformation?.BatchError?.Code, StringComparison.InvariantCultureIgnoreCase)) .WaitAndRetryAsync( 5, - (retryAttempt, exception, _) => (exception as BatchException).RequestInformation?.RetryAfter ?? TimeSpan.FromSeconds(Math.Pow(2, retryAttempt)), + (retryAttempt, exception, _) => (exception as BatchException)?.RequestInformation?.RetryAfter ?? 
TimeSpan.FromSeconds(Math.Pow(2, retryAttempt)), (exception, delay, retryAttempt, _) => { - var requestId = (exception as BatchException).RequestInformation?.ServiceRequestId; + var requestId = (exception as BatchException)?.RequestInformation?.ServiceRequestId; var reason = (exception.InnerException as Microsoft.Azure.Batch.Protocol.Models.BatchErrorException)?.Response?.ReasonPhrase; - logger.LogDebug(exception, "Retry attempt {RetryAttempt} after delay {DelaySeconds} for NodeNotReady exception: ServiceRequestId: {ServiceRequestId}, BatchErrorCode: NodeNotReady, Reason: {ReasonPhrase}", retryAttempt, delay.TotalSeconds, requestId, reason); + this.logger.LogDebug(exception, "Retry attempt {RetryAttempt} after delay {DelaySeconds} for NodeNotReady exception: ServiceRequestId: {ServiceRequestId}, BatchErrorCode: NodeNotReady, Reason: {ReasonPhrase}", retryAttempt, delay.TotalSeconds, requestId, reason); return Task.FromResult(false); }); @@ -536,7 +536,12 @@ private static Task GetAzureAccessTokenAsync(CancellationToken cancellat /// public async Task CreateBatchPoolAsync(BatchModels.Pool poolInfo, bool isPreemptable, CancellationToken cancellationToken) - => await batchPoolManager.CreateBatchPoolAsync(poolInfo, isPreemptable, cancellationToken); + { + logger.LogInformation("Creating batch pool named {PoolName} with vmSize {PoolVmSize} and low priority {IsPreemptable}", poolInfo.Name, poolInfo.VmSize, isPreemptable); + var pool = await batchPoolManager.CreateBatchPoolAsync(poolInfo, isPreemptable, cancellationToken); + logger.LogInformation("Successfully created batch pool named {PoolName} with vmSize {PoolVmSize} and low priority {IsPreemptable}", poolInfo.Name, poolInfo.VmSize, isPreemptable); + return pool; + } // https://learn.microsoft.com/azure/azure-resource-manager/management/move-resource-group-and-subscription#changed-resource-id [GeneratedRegex("/*/resourceGroups/([^/]*)/*")] @@ -582,8 +587,8 @@ public IAsyncEnumerable ListTasksAsync(string jobId, 
DetailLevel deta => batchClient.JobOperations.ListTasks(jobId, detailLevel: detailLevel).ToAsyncEnumerable(); /// - public async Task DisableBatchPoolAutoScaleAsync(string poolId, CancellationToken cancellationToken) - => await batchClient.PoolOperations.DisableAutoScaleAsync(poolId, cancellationToken: cancellationToken); + public Task DisableBatchPoolAutoScaleAsync(string poolId, CancellationToken cancellationToken) + => batchClient.PoolOperations.DisableAutoScaleAsync(poolId, cancellationToken: cancellationToken); /// public async Task EnableBatchPoolAutoScaleAsync(string poolId, bool preemptable, TimeSpan interval, IAzureProxy.BatchPoolAutoScaleFormulaFactory formulaFactory, CancellationToken cancellationToken) @@ -595,7 +600,13 @@ public async Task EnableBatchPoolAutoScaleAsync(string poolId, bool preemptable, throw new InvalidOperationException(); } - await batchClient.PoolOperations.EnableAutoScaleAsync(poolId, formulaFactory(preemptable, preemptable ? currentLowPriority ?? 0 : currentDedicated ?? 0), interval, cancellationToken: cancellationToken); + var formula = formulaFactory(preemptable, preemptable ? currentLowPriority ?? 0 : currentDedicated ?? 
0); + logger.LogDebug("Setting Pool {PoolID} to AutoScale({AutoScaleInterval}): '{AutoScaleFormula}'", poolId, interval, formula); + await batchClient.PoolOperations.EnableAutoScaleAsync(poolId, formula, interval, cancellationToken: cancellationToken); } + + /// + public Task EvaluateAutoScaleAsync(string poolId, string autoscaleFormula, CancellationToken cancellationToken) + => batchClient.PoolOperations.EvaluateAutoScaleAsync(poolId, autoscaleFormula, cancellationToken: cancellationToken); } } diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index 8afc85ecc..7fe6dd694 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -57,8 +57,11 @@ public BatchPool(IBatchScheduler batchScheduler, IOptions StartTaskFailures { get; } = new(); private Queue ResizeErrors { get; } = new(); + private IAsyncEnumerable GetTasksAsync(string select, string filter) + => _azureProxy.ListTasksAsync(Pool.PoolId, new ODATADetailLevel { SelectClause = select, FilterClause = filter }); + internal IAsyncEnumerable GetTasksAsync(bool includeCompleted) - => _azureProxy.ListTasksAsync(Pool.PoolId, new ODATADetailLevel { SelectClause = "id,stateTransitionTime", FilterClause = includeCompleted ? default : "state ne 'completed'" }); + => GetTasksAsync("id,stateTransitionTime", includeCompleted ? default : "state ne 'completed'"); private async ValueTask RemoveNodesAsync(IList nodesToRemove, CancellationToken cancellationToken) { @@ -120,10 +123,14 @@ private async ValueTask ServicePoolGetResizeErrorsAsync(CancellationToken cancel { ResizeErrors.Clear(); var pool = await _azureProxy.GetBatchPoolAsync(Pool.PoolId, cancellationToken, new ODATADetailLevel { SelectClause = "id,allocationStateTransitionTime,autoScaleFormula,autoScaleRun,resizeErrors" }); + var now = DateTime.UtcNow; + var autoScaleRunCutoff = now - (5 * AutoScaleEvaluationInterval); // It takes some cycles to reset autoscale, so give batch some time to catch up on its own. 
+ var autoScaleTransitionCutoff = now - (10 * AutoScaleEvaluationInterval); - if (pool.AutoScaleRun?.Timestamp < DateTime.UtcNow - (5 * AutoScaleEvaluationInterval)) // It takes some cycles to reset autoscale, so give batch some time to catch up on its own. + if (pool.AutoScaleRun?.Error is not null || (autoScaleRunCutoff > Creation && pool.AutoScaleRun?.Timestamp < autoScaleRunCutoff)) { _resetAutoScalingRequired |= true; + _logger.LogDebug("Resetting AutoScale for pool {PoolId} because AutoScaleRun error or timestamp is older than {AutoScaleRunCutoff}.", Pool.PoolId, autoScaleRunCutoff); } else { @@ -141,21 +148,24 @@ private async ValueTask ServicePoolGetResizeErrorsAsync(CancellationToken cancel // Errors that sometimes require mitigation case PoolResizeErrorCodes.AccountCoreQuotaReached: case PoolResizeErrorCodes.AccountLowPriorityCoreQuotaReached: - if (pool.AllocationStateTransitionTime < DateTime.UtcNow - (10 * AutoScaleEvaluationInterval) && - (await pool.EvaluateAutoScaleAsync(pool.AutoScaleFormula, cancellationToken: cancellationToken))?.Error is not null) + if (autoScaleTransitionCutoff > Creation && pool.AllocationStateTransitionTime < autoScaleTransitionCutoff && + (await _azureProxy.EvaluateAutoScaleAsync(pool.Id, pool.AutoScaleFormula, cancellationToken: cancellationToken))?.Error is not null) { _resetAutoScalingRequired |= true; + _logger.LogDebug("Resetting AutoScale for pool {PoolId} because AllocationStateTransitionTime timestamp is older than {AllocationStateTransitionTimeCutoff} and formula evaluation fails.", Pool.PoolId, autoScaleTransitionCutoff); } break; // Errors to force autoscale to be reset case PoolResizeErrorCodes.ResizeStopped: _resetAutoScalingRequired |= true; + _logger.LogDebug("Resetting AutoScale for pool {PoolId} because pool resize was stopped.", Pool.PoolId); break; // Errors to both force resetting autoscale and fail tasks case PoolResizeErrorCodes.AllocationFailed: _resetAutoScalingRequired |= true; + 
_logger.LogDebug("Resetting AutoScale for pool {PoolId} because pool allocation failed.", Pool.PoolId); goto default; // Errors to fail tasks should be directed here @@ -283,6 +293,7 @@ private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken canc if (nodesToRemove.Any()) { await RemoveNodesAsync((IList)nodesToRemove, cancellationToken); + _resetAutoScalingRequired = false; _scalingMode = ScalingMode.RemovingFailedNodes; } else @@ -312,6 +323,7 @@ private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken canc case ScalingMode.SettingAutoScale: _scalingMode = ScalingMode.AutoScaleEnabled; + _logger.LogInformation(@"Pool {PoolId} is back to normal resize and monitoring status.", Pool.PoolId); break; } @@ -464,7 +476,7 @@ await PerformTask(ServicePoolAsync(ServiceKind.Rotate, cancellationToken), cance switch (exceptions.Count) { case 0: - await foreach (var (id, state) in _azureProxy.ListTasksAsync(Pool.PoolId, new ODATADetailLevel { SelectClause = "id", FilterClause = "state eq 'active'" }).Select(cloud => cloud.Id).Zip(GetFailures(cancellationToken), (id, state) => (id, state)).WithCancellation(cancellationToken)) + await foreach (var (id, state) in GetTasksAsync("id", "state eq 'active'").Select(cloud => cloud.Id).Zip(GetFailures(cancellationToken), (id, state) => (id, state)).WithCancellation(cancellationToken)) { yield return (id, state); } @@ -540,7 +552,7 @@ TaskFailureInformation PopNextStartTaskFailure() /// public IAsyncEnumerable GetCompletedTasks(CancellationToken _1) { - return _azureProxy.ListTasksAsync(Pool.PoolId, new ODATADetailLevel { SelectClause = "id,executionInfo", FilterClause = "state eq 'completed'" }); + return GetTasksAsync("id,executionInfo", "state eq 'completed'"); } // Returns false when pool/job was removed because it was not found. Returns true if the error was completely something else. 
diff --git a/src/TesApi.Web/BatchPoolService.cs b/src/TesApi.Web/BatchPoolService.cs index 99919bda2..87571cee8 100644 --- a/src/TesApi.Web/BatchPoolService.cs +++ b/src/TesApi.Web/BatchPoolService.cs @@ -145,6 +145,7 @@ private async ValueTask ProcessCompletedCloudTasksAsync(CancellationToken stoppi { var tasks = new List(); await ExecuteActionOnPoolsAsync("ServiceBatchTasks", async (pool, token) => tasks.AddRange(await pool.GetCompletedTasks(token).ToListAsync(token)), stoppingToken); + logger.LogDebug("ProcessCompletedCloudTasksAsync found {CompletedTasks} completed tasks.", tasks.Count); await OrchestrateTesTasksOnBatchAsync( "Completed", diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index 87d74c4f7..861263a9c 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -221,5 +221,8 @@ public async Task CreateBatchPoolAsync(BatchModels.Pool poolInf /// public Task EnableBatchPoolAutoScaleAsync(string poolId, bool preemptable, TimeSpan interval, IAzureProxy.BatchPoolAutoScaleFormulaFactory formulaFactory, CancellationToken cancellationToken) => azureProxy.EnableBatchPoolAutoScaleAsync(poolId, preemptable, interval, formulaFactory, cancellationToken); + + /// + public Task EvaluateAutoScaleAsync(string poolId, string autoscaleFormula, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.EvaluateAutoScaleAsync(poolId, autoscaleFormula, ct), cancellationToken); } } diff --git a/src/TesApi.Web/IAzureProxy.cs b/src/TesApi.Web/IAzureProxy.cs index e86c36306..1b85636a3 100644 --- a/src/TesApi.Web/IAzureProxy.cs +++ b/src/TesApi.Web/IAzureProxy.cs @@ -21,7 +21,7 @@ public interface IAzureProxy /// Creates a new Azure Batch job for /// /// - /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. + /// A for controlling the lifetime of the asynchronous operation. 
Task CreateBatchJobAsync(PoolInformation poolInformation, CancellationToken cancellationToken); /// @@ -30,14 +30,14 @@ public interface IAzureProxy /// /// /// - /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. + /// A for controlling the lifetime of the asynchronous operation. Task AddBatchTaskAsync(string tesTaskId, CloudTask cloudTask, PoolInformation poolInformation, CancellationToken cancellationToken); /// /// Terminates and deletes an Azure Batch job for /// /// - /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. + /// A for controlling the lifetime of the asynchronous operation. Task DeleteBatchJobAsync(PoolInformation poolInformation, CancellationToken cancellationToken); /// @@ -60,7 +60,7 @@ public interface IAzureProxy /// Deletes an Azure Batch job for Autopools /// /// The unique TES task ID - /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. + /// A for controlling the lifetime of the asynchronous operation. Task DeleteBatchJobAsync(string taskId, CancellationToken cancellationToken); /// @@ -68,7 +68,7 @@ public interface IAzureProxy /// /// The unique TES task ID /// The batch job that contains the task - /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. + /// A for controlling the lifetime of the asynchronous operation. Task TerminateBatchTaskAsync(string taskId, string jobId, CancellationToken cancellationToken); /// @@ -76,7 +76,7 @@ public interface IAzureProxy /// /// The unique TES task ID /// The batch job that contains the task - /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. + /// A for controlling the lifetime of the asynchronous operation. 
Task DeleteBatchTaskAsync(string taskId, string jobId, CancellationToken cancellationToken); /// @@ -168,7 +168,7 @@ public interface IAzureProxy /// These jobs are active for prolonged period of time, have auto pool, NoAction termination option, and no tasks /// /// - /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. + /// A for controlling the lifetime of the asynchronous operation. /// List of Batch job ids Task> ListOrphanedJobsToDeleteAsync(TimeSpan minJobAge, CancellationToken cancellationToken); @@ -177,7 +177,7 @@ public interface IAzureProxy /// /// /// - /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. + /// A for controlling the lifetime of the asynchronous operation. /// Active pool ids Task> GetActivePoolIdsAsync(string prefix, TimeSpan minAge, CancellationToken cancellationToken); @@ -191,7 +191,7 @@ public interface IAzureProxy /// /// Gets the list of pool ids referenced by the jobs /// - /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. + /// A for controlling the lifetime of the asynchronous operation. /// Pool ids Task> GetPoolIdsReferencedByJobsAsync(CancellationToken cancellationToken); @@ -199,14 +199,14 @@ public interface IAzureProxy /// Deletes the specified pool /// /// The id of the pool. - /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. + /// A for controlling the lifetime of the asynchronous operation. Task DeleteBatchPoolAsync(string poolId, CancellationToken cancellationToken); /// /// Retrieves the specified pool /// /// The of the pool to retrieve. - /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. + /// A for controlling the lifetime of the asynchronous operation. /// A Microsoft.Azure.Batch.DetailLevel used for controlling which properties are retrieved from the service. 
/// Task GetBatchPoolAsync(string poolId, CancellationToken cancellationToken, DetailLevel detailLevel = default); @@ -215,7 +215,7 @@ public interface IAzureProxy /// Retrieves the specified batch job. /// /// The of the job to retrieve. - /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. + /// A for controlling the lifetime of the asynchronous operation. /// A Microsoft.Azure.Batch.DetailLevel used for controlling which properties are retrieved from the service. /// Task GetBatchJobAsync(string jobId, CancellationToken cancellationToken, DetailLevel detailLevel = default); @@ -241,7 +241,7 @@ public interface IAzureProxy /// /// The id of the pool. /// Enumerable list of s to delete. - /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. + /// A for controlling the lifetime of the asynchronous operation. /// Task DeleteBatchComputeNodesAsync(string poolId, IEnumerable computeNodes, CancellationToken cancellationToken); @@ -249,7 +249,7 @@ public interface IAzureProxy /// Gets the allocation state and numbers of targeted and current compute nodes /// /// The id of the pool. - /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. + /// A for controlling the lifetime of the asynchronous operation. /// Task<(AllocationState? AllocationState, bool? AutoScaleEnabled, int? TargetLowPriority, int? CurrentLowPriority, int? TargetDedicated, int? CurrentDedicated)> GetFullAllocationStateAsync(string poolId, CancellationToken cancellationToken); @@ -278,7 +278,7 @@ public interface IAzureProxy /// Disables AutoScale in a Batch Pool ///
/// The id of the pool. - /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. + /// A for controlling the lifetime of the asynchronous operation. /// Task DisableBatchPoolAutoScaleAsync(string poolId, CancellationToken cancellationToken); @@ -289,10 +289,26 @@ public interface IAzureProxy /// Type of compute nodes: false if dedicated, otherwise true. /// The interval for periodic reevaluation of the formula. /// A factory function that generates an auto-scale formula. - /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. + /// A for controlling the lifetime of the asynchronous operation. /// Task EnableBatchPoolAutoScaleAsync(string poolId, bool preemptable, TimeSpan interval, BatchPoolAutoScaleFormulaFactory formulaFactory, CancellationToken cancellationToken); + /// + /// Gets the result of evaluating an automatic scaling formula on the specified pool. This + /// is primarily for validating an autoscale formula, as it simply returns the result + /// without applying the formula to the pool. + /// + /// The id of the pool. + /// The formula to be evaluated on the pool. + /// A for controlling the lifetime of the asynchronous operation. + /// The result of evaluating the on the specified pool. + /// + /// The formula is validated and its results calculated, but is not applied to the pool. To apply the formula to the pool, use . + /// This method does not change any state of the pool, and does not affect the or . + /// The evaluate operation runs asynchronously. 
+ /// + Task EvaluateAutoScaleAsync(string poolId, string autoscaleFormula, CancellationToken cancellationToken); + /// /// Describes a function to generate autoscale formulas /// diff --git a/src/TesApi.Web/Management/Batch/ArmBatchPoolManager.cs b/src/TesApi.Web/Management/Batch/ArmBatchPoolManager.cs index 9418154c2..15fe452ff 100644 --- a/src/TesApi.Web/Management/Batch/ArmBatchPoolManager.cs +++ b/src/TesApi.Web/Management/Batch/ArmBatchPoolManager.cs @@ -43,18 +43,14 @@ public async Task CreateBatchPoolAsync(Pool poolInfo, bool isPr { var batchManagementClient = await azureClientsFactory.CreateBatchAccountManagementClient(cancellationToken); - logger.LogInformation("Creating manual batch pool named {PoolName} with vmSize {PoolVmSize} and low priority {IsPreemptable}", poolInfo.Name, poolInfo.VmSize, isPreemptable); - var pool = await batchManagementClient.Pool.CreateAsync(azureClientsFactory.BatchAccountInformation.ResourceGroupName, azureClientsFactory.BatchAccountInformation.Name, poolInfo.Name, poolInfo, cancellationToken: cancellationToken); - logger.LogInformation("Successfully created manual batch pool named {PoolName} with vmSize {PoolVmSize} and low priority {IsPreemptable}", poolInfo.Name, poolInfo.VmSize, isPreemptable); - return new PoolInformation() { PoolId = pool.Name }; } catch (Exception exc) { var batchError = Newtonsoft.Json.JsonConvert.SerializeObject((exc as Microsoft.Azure.Batch.Common.BatchException)?.RequestInformation?.BatchError); - logger.LogError(exc, "Error trying to create manual batch pool named {PoolName} with vmSize {PoolVmSize} and low priority {IsPreemptable}. Batch error: {BatchError}", poolInfo.Name, poolInfo.VmSize, isPreemptable, batchError); + logger.LogError(exc, "Error trying to create batch pool named {PoolName} with vmSize {PoolVmSize} and low priority {IsPreemptable}. 
Batch error: {BatchError}", poolInfo.Name, poolInfo.VmSize, isPreemptable, batchError); throw; } } diff --git a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs index 82618fbd1..70c3ccd5f 100644 --- a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs +++ b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs @@ -142,7 +142,7 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func< if (0 == tesTasks.Length) { - logger.LogInformation("OrchestrateTesTasksOnBatch({Poll}) skipped.", pollName); + logger.LogDebug("OrchestrateTesTasksOnBatch({Poll}) skipped.", pollName); return; } @@ -274,7 +274,7 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func< await batchScheduler.FlushPoolsAsync(pools, stoppingToken); } - logger.LogInformation("OrchestrateTesTasksOnBatch({Poll}) for {TaskCount} tasks completed in {TotalSeconds} seconds.", pollName, tesTasks.Length, DateTime.UtcNow.Subtract(startTime).TotalSeconds); + logger.LogDebug("OrchestrateTesTasksOnBatch({Poll}) for {TaskCount} tasks completed in {TotalSeconds} seconds.", pollName, tesTasks.Length, DateTime.UtcNow.Subtract(startTime).TotalSeconds); } } } diff --git a/src/TesApi.Web/appsettings.json b/src/TesApi.Web/appsettings.json index b98e90920..4f5fc0b80 100644 --- a/src/TesApi.Web/appsettings.json +++ b/src/TesApi.Web/appsettings.json @@ -1,6 +1,13 @@ { "Logging": { "LogLevel": { + "Azure": "Warning", + "Microsoft": "Warning", + "TesApi.Web.AzureProxy": "Debug", + "TesApi.Web.BatchPool": "Debug", + "TesApi.Web.BatchPoolService": "Debug", + "TesApi.Web.BatchScheduler": "Debug", + "TesApi.Web.Scheduler": "Debug", "Default": "Information" } }, From 3456c4af9599208fd8d5c1855538fb8c3f816a24 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 5 Oct 2023 08:34:09 -0700 Subject: [PATCH 021/202] Additional logging --- src/TesApi.Web/BatchPool.cs | 4 +--- src/TesApi.Web/BatchPoolService.cs | 7 
+++++++ src/TesApi.Web/BatchScheduler.cs | 2 +- src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs | 4 ++-- 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index 7fe6dd694..cb0936ab0 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -551,9 +551,7 @@ TaskFailureInformation PopNextStartTaskFailure() /// public IAsyncEnumerable GetCompletedTasks(CancellationToken _1) - { - return GetTasksAsync("id,executionInfo", "state eq 'completed'"); - } + => GetTasksAsync("id,executionInfo", "state eq 'completed'"); // Returns false when pool/job was removed because it was not found. Returns true if the error was completely something else. private async ValueTask RemoveMissingPoolsAsync(Exception ex, CancellationToken cancellationToken) diff --git a/src/TesApi.Web/BatchPoolService.cs b/src/TesApi.Web/BatchPoolService.cs index 87571cee8..8e4e211db 100644 --- a/src/TesApi.Web/BatchPoolService.cs +++ b/src/TesApi.Web/BatchPoolService.cs @@ -162,13 +162,20 @@ async IAsyncEnumerable GetTesTasks([EnumeratorCancellation] Cancellatio TesTask tesTask = default; if (await repository.TryGetItemAsync(id, cancellationToken, task => tesTask = task) && tesTask is not null) { + logger.LogDebug("Completing task {TesTask}.", tesTask.Id); yield return tesTask; } + else + { + logger.LogDebug("Could not find task {TesTask}.", tesTask.Id); + yield return null; + } } } AzureBatchTaskState GetCompletedBatchState(CloudTask task) { + logger.LogDebug("Getting batch task state from completed task {TesTask}.", task.Id); return task.ExecutionInformation.Result switch { Microsoft.Azure.Batch.Common.TaskExecutionResult.Success => new( diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index a15927118..c3cd8fde9 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -373,7 +373,7 @@ public async Task UploadTaskRunnerIfNeeded(CancellationToken 
cancellationToken) ArgumentNullException.ThrowIfNull(taskStates); return taskStates.Zip(tesTasks, (TaskState, TesTask) => (TaskState, TesTask)) - .Where(entry => entry.TesTask.IsActiveState()) // Removes already terminal TesTasks + .Where(entry => entry.TesTask?.IsActiveState() ?? false) // Removes already terminal TesTasks from being further processed. .Select(entry => (entry.TesTask, IsModifiedAsync: WrapHandleTesTaskTransitionAsync(entry.TesTask, entry.TaskState, cancellationToken))) .WhenEach(cancellationToken, tuple => tuple.IsModifiedAsync); diff --git a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs index 70c3ccd5f..6f44045af 100644 --- a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs +++ b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs @@ -140,7 +140,7 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func< var pools = new HashSet(); var tesTasks = await (await tesTaskGetter(stoppingToken)).ToArrayAsync(stoppingToken); - if (0 == tesTasks.Length) + if (tesTasks.All(task => task is null)) { logger.LogDebug("OrchestrateTesTasksOnBatch({Poll}) skipped.", pollName); return; @@ -274,7 +274,7 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func< await batchScheduler.FlushPoolsAsync(pools, stoppingToken); } - logger.LogDebug("OrchestrateTesTasksOnBatch({Poll}) for {TaskCount} tasks completed in {TotalSeconds} seconds.", pollName, tesTasks.Length, DateTime.UtcNow.Subtract(startTime).TotalSeconds); + logger.LogDebug("OrchestrateTesTasksOnBatch({Poll}) for {TaskCount} tasks completed in {TotalSeconds} seconds.", pollName, tesTasks.Where(task => task is not null).Count(), DateTime.UtcNow.Subtract(startTime).TotalSeconds); } } } From 33fff3e2a1883d95b84beb71153ff77573f493a8 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 5 Oct 2023 12:29:55 -0700 Subject: [PATCH 022/202] Use actual relatopnship between cloudtask and 
testasks ids --- src/TesApi.Tests/BatchPoolTests.cs | 2 +- src/TesApi.Web/AzureProxy.cs | 8 ++++---- src/TesApi.Web/BatchPool.cs | 2 +- src/TesApi.Web/BatchPoolService.cs | 8 ++++---- src/TesApi.Web/BatchScheduler.cs | 11 +++++++++-- src/TesApi.Web/IBatchScheduler.cs | 7 +++++++ 6 files changed, 26 insertions(+), 12 deletions(-) diff --git a/src/TesApi.Tests/BatchPoolTests.cs b/src/TesApi.Tests/BatchPoolTests.cs index 14ffd62d0..2836161f6 100644 --- a/src/TesApi.Tests/BatchPoolTests.cs +++ b/src/TesApi.Tests/BatchPoolTests.cs @@ -39,7 +39,7 @@ public async Task RotateMarksPoolUnavailableWhenRotateIntervalHasPassed() { var azureProxy = AzureProxyReturnValues.Get(); azureProxy.AzureProxyGetComputeNodeAllocationState = id => (Microsoft.Azure.Batch.Common.AllocationState.Steady, true, 0, 0, 1, 1); - azureProxy.AzureProxyListTasks = (jobId, detailLevel) => AsyncEnumerable.Empty().Append(GenerateTask(jobId, "job1")); + azureProxy.AzureProxyListTasks = (jobId, detailLevel) => AsyncEnumerable.Empty().Append(GenerateTask(jobId, "task1-1")); azureProxy.AzureProxyListComputeNodesAsync = (i, d) => AsyncEnumerable.Empty(); var services = GetServiceProvider(azureProxy); var pool = await AddPool(services.GetT(), false); diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index 122e26d19..3fce4afb8 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -268,11 +268,11 @@ public async Task TerminateBatchTaskAsync(string tesTaskId, string jobId, Cancel SelectClause = "id" }; - List batchTasksToDelete = default; + List batchTasksToTerminate = default; try { - batchTasksToDelete = await batchClient.JobOperations.ListTasks(jobId, jobFilter).ToAsyncEnumerable().ToListAsync(cancellationToken); + batchTasksToTerminate = await batchClient.JobOperations.ListTasks(jobId, jobFilter).ToAsyncEnumerable().ToListAsync(cancellationToken); } catch (BatchException ex) when (ex.InnerException is Microsoft.Azure.Batch.Protocol.Models.BatchErrorException 
bee && "JobNotFound".Equals(bee.Body?.Code, StringComparison.InvariantCultureIgnoreCase)) { @@ -280,12 +280,12 @@ public async Task TerminateBatchTaskAsync(string tesTaskId, string jobId, Cancel return; // Task cannot exist if the job is not found. } - if (batchTasksToDelete.Count > 1) + if (batchTasksToTerminate.Count > 1) { logger.LogWarning("Found more than one active task for TES task {TesTask}", tesTaskId); } - foreach (var task in batchTasksToDelete) + foreach (var task in batchTasksToTerminate) { logger.LogInformation("Terminating task {BatchTask}", task.Id); await batchNodeNotReadyRetryPolicy.ExecuteAsync(ct => task.TerminateAsync(cancellationToken: ct), cancellationToken); diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index cb0936ab0..020ed0a9a 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -141,12 +141,12 @@ private async ValueTask ServicePoolGetResizeErrorsAsync(CancellationToken cancel // Errors to ignore case PoolResizeErrorCodes.RemoveNodesFailed: case PoolResizeErrorCodes.CommunicationEnabledPoolReachedMaxVMCount: - case PoolResizeErrorCodes.AccountSpotCoreQuotaReached: case PoolResizeErrorCodes.AllocationTimedOut: break; // Errors that sometimes require mitigation case PoolResizeErrorCodes.AccountCoreQuotaReached: + case PoolResizeErrorCodes.AccountSpotCoreQuotaReached: case PoolResizeErrorCodes.AccountLowPriorityCoreQuotaReached: if (autoScaleTransitionCutoff > Creation && pool.AllocationStateTransitionTime < autoScaleTransitionCutoff && (await _azureProxy.EvaluateAutoScaleAsync(pool.Id, pool.AutoScaleFormula, cancellationToken: cancellationToken))?.Error is not null) diff --git a/src/TesApi.Web/BatchPoolService.cs b/src/TesApi.Web/BatchPoolService.cs index 8e4e211db..de2e65878 100644 --- a/src/TesApi.Web/BatchPoolService.cs +++ b/src/TesApi.Web/BatchPoolService.cs @@ -118,7 +118,7 @@ async ValueTask ProcessFailures(IAsyncEnumerable<(string taskId, AzureBatchTaskS await foreach (var (id, 
state) in failures.WithCancellation(cancellationToken)) { TesTask tesTask = default; - if (await repository.TryGetItemAsync(id, cancellationToken, task => tesTask = task) && tesTask is not null) + if (await repository.TryGetItemAsync(batchScheduler.GetTesTaskIdFromCloudTaskId(id), cancellationToken, task => tesTask = task) && tesTask is not null) { list.Add((tesTask, state)); } @@ -157,7 +157,7 @@ await OrchestrateTesTasksOnBatchAsync( async IAsyncEnumerable GetTesTasks([EnumeratorCancellation] CancellationToken cancellationToken) { - foreach (var id in tasks.Select(t => t.Id)) + foreach (var id in tasks.Select(t => batchScheduler.GetTesTaskIdFromCloudTaskId(t.Id))) { TesTask tesTask = default; if (await repository.TryGetItemAsync(id, cancellationToken, task => tesTask = task) && tesTask is not null) @@ -167,7 +167,7 @@ async IAsyncEnumerable GetTesTasks([EnumeratorCancellation] Cancellatio } else { - logger.LogDebug("Could not find task {TesTask}.", tesTask.Id); + logger.LogDebug("Could not find task {TesTask}.", id); yield return null; } } @@ -175,7 +175,7 @@ async IAsyncEnumerable GetTesTasks([EnumeratorCancellation] Cancellatio AzureBatchTaskState GetCompletedBatchState(CloudTask task) { - logger.LogDebug("Getting batch task state from completed task {TesTask}.", task.Id); + logger.LogDebug("Getting batch task state from completed task {TesTask}.", batchScheduler.GetTesTaskIdFromCloudTaskId(task.Id)); return task.ExecutionInformation.Result switch { Microsoft.Azure.Batch.Common.TaskExecutionResult.Success => new( diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index c3cd8fde9..f258b7eed 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -449,6 +449,13 @@ private static string StandardizeStartTaskPath(string startTaskPath, string defa private static bool IsCromwellCommandScript(TesInput inputFile) => (inputFile.Name?.Equals("commandScript") ?? 
false) && (inputFile.Description?.EndsWith(".commandScript") ?? false) && inputFile.Type == TesFileType.FILEEnum && inputFile.Path.EndsWith($"/{CromwellScriptFileName}"); + /// + public string GetTesTaskIdFromCloudTaskId(string cloudTaskId) + { + var separatorIndex = cloudTaskId.LastIndexOf('-'); + return separatorIndex == -1 ? cloudTaskId : cloudTaskId[..separatorIndex]; + } + /// public async IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessQueuedTesTasksAsync(TesTask[] tesTasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) { @@ -568,9 +575,9 @@ private static bool IsCromwellCommandScript(TesInput inputFile) cancellationToken: cancellationToken) ).Pool; - var jobOrTaskId = $"{tesTask.Id}-{tesTask.Logs.Count}"; + var cloudTaskId = $"{tesTask.Id}-{tesTask.Logs.Count}"; tesTask.PoolId = poolInformation.PoolId; - var cloudTask = await ConvertTesTaskToBatchTaskUsingRunnerAsync(jobOrTaskId, tesTask, cancellationToken); + var cloudTask = await ConvertTesTaskToBatchTaskUsingRunnerAsync(cloudTaskId, tesTask, cancellationToken); logger.LogInformation(@"Creating batch task for TES task {TesTaskId}. Using VM size {VmSize}.", tesTask.Id, virtualMachineInfo.VmSize); await azureProxy.AddBatchTaskAsync(tesTask.Id, cloudTask, poolInformation, cancellationToken); diff --git a/src/TesApi.Web/IBatchScheduler.cs b/src/TesApi.Web/IBatchScheduler.cs index 999ea0a8f..1d0ecc690 100644 --- a/src/TesApi.Web/IBatchScheduler.cs +++ b/src/TesApi.Web/IBatchScheduler.cs @@ -94,5 +94,12 @@ public interface IBatchScheduler /// /// ValueTask FlushPoolsAsync(IEnumerable assignedPools, CancellationToken cancellationToken); + + /// + /// Gets the from a . + /// + /// . + /// . 
+ string GetTesTaskIdFromCloudTaskId(string cloudTaskId); } } From 372ded650a7a6199b4ddbaf7601d55632f221b72 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 5 Oct 2023 21:13:59 -0700 Subject: [PATCH 023/202] Removed last vestiges of support for AutoPools --- ...{BatchScheduler.BatchPools.cs => BatchPools.BatchScheduler.cs} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/TesApi.Web/{BatchScheduler.BatchPools.cs => BatchPools.BatchScheduler.cs} (100%) diff --git a/src/TesApi.Web/BatchScheduler.BatchPools.cs b/src/TesApi.Web/BatchPools.BatchScheduler.cs similarity index 100% rename from src/TesApi.Web/BatchScheduler.BatchPools.cs rename to src/TesApi.Web/BatchPools.BatchScheduler.cs From 7a3092461d723d29556569958ffe0b948478fe2a Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 5 Oct 2023 21:15:39 -0700 Subject: [PATCH 024/202] Removed last vestiges of support for AutoPools --- src/TesApi.Tests/BatchPoolTests.cs | 97 ++++--- src/TesApi.Tests/BatchSchedulerTests.cs | 42 +-- .../TerraBatchPoolManagerTests.cs | 16 +- src/TesApi.Web/AzureProxy.cs | 111 ++------ src/TesApi.Web/BatchPool.cs | 64 +++-- src/TesApi.Web/BatchPoolService.cs | 2 +- src/TesApi.Web/BatchPools.BatchScheduler.cs | 18 +- src/TesApi.Web/BatchScheduler.cs | 268 ++++++------------ .../CachingWithRetriesAzureProxy.cs | 33 +-- src/TesApi.Web/IAzureProxy.cs | 74 ++--- src/TesApi.Web/IBatchPool.cs | 4 +- .../Management/Batch/ArmBatchPoolManager.cs | 4 +- .../Management/Batch/IBatchPoolManager.cs | 4 +- .../Management/Batch/TerraBatchPoolManager.cs | 4 +- 14 files changed, 265 insertions(+), 476 deletions(-) diff --git a/src/TesApi.Tests/BatchPoolTests.cs b/src/TesApi.Tests/BatchPoolTests.cs index 2836161f6..961023b87 100644 --- a/src/TesApi.Tests/BatchPoolTests.cs +++ b/src/TesApi.Tests/BatchPoolTests.cs @@ -93,7 +93,7 @@ public async Task RemovePoolIfEmptyDeletesPoolIfPoolIsNotAvailableAndHasNoComput void DeletePool(string poolId, System.Threading.CancellationToken 
cancellationToken) { - Assert.AreEqual(poolId, pool.Pool.PoolId); + Assert.AreEqual(poolId, pool.Id); isDeleted = true; } } @@ -106,14 +106,14 @@ public async Task ServicePoolGetResizeErrorsResetsAutoScalingWhenBatchStopsEvalu var pool = await AddPool(services.GetT(), false); azureProxy.SetPoolState( - pool.Pool.PoolId, + pool.Id, enableAutoScale: true, autoScaleRun: new(DateTime.UtcNow, error: new("ErrorCode", "Message"))); await pool.ServicePoolAsync(BatchPool.ServiceKind.GetResizeErrors); await pool.ServicePoolAsync(BatchPool.ServiceKind.ManagePoolScaling); - services.AzureProxy.Verify(a => a.DisableBatchPoolAutoScaleAsync(pool.Pool.PoolId, It.IsAny())); + services.AzureProxy.Verify(a => a.DisableBatchPoolAutoScaleAsync(pool.Id, It.IsAny())); } [TestMethod] @@ -125,14 +125,14 @@ public async Task ServicePoolGetResizeErrorsResetsAutoScalingWhenBatchStopsEvalu pool.TimeShift(7 * BatchPool.AutoScaleEvaluationInterval); azureProxy.SetPoolState( - pool.Pool.PoolId, + pool.Id, enableAutoScale: true, autoScaleRun: new(DateTime.UtcNow - (6 * BatchPool.AutoScaleEvaluationInterval))); await pool.ServicePoolAsync(BatchPool.ServiceKind.GetResizeErrors); await pool.ServicePoolAsync(BatchPool.ServiceKind.ManagePoolScaling); - services.AzureProxy.Verify(a => a.DisableBatchPoolAutoScaleAsync(pool.Pool.PoolId, It.IsAny())); + services.AzureProxy.Verify(a => a.DisableBatchPoolAutoScaleAsync(pool.Id, It.IsAny())); } [TestMethod] @@ -145,7 +145,7 @@ public async Task ServicePoolGetResizeErrorsResetsAutoScalingWhenBatchStopsEvalu pool.TimeShift(12 * BatchPool.AutoScaleEvaluationInterval); azureProxy.SetPoolState( - pool.Pool.PoolId, + pool.Id, enableAutoScale: true, resizeErrors: Enumerable.Repeat(new(Microsoft.Azure.Batch.Common.PoolResizeErrorCodes.AccountCoreQuotaReached, "Core quota reached."), 1).ToList(), allocationStateTransitionTime: DateTime.UtcNow - (11 * BatchPool.AutoScaleEvaluationInterval)); @@ -153,7 +153,7 @@ public async Task 
ServicePoolGetResizeErrorsResetsAutoScalingWhenBatchStopsEvalu await pool.ServicePoolAsync(BatchPool.ServiceKind.GetResizeErrors); await pool.ServicePoolAsync(BatchPool.ServiceKind.ManagePoolScaling); - services.AzureProxy.Verify(a => a.DisableBatchPoolAutoScaleAsync(pool.Pool.PoolId, It.IsAny())); + services.AzureProxy.Verify(a => a.DisableBatchPoolAutoScaleAsync(pool.Id, It.IsAny())); } private static TestServices.TestServiceProvider GetServiceProvider(AzureProxyReturnValues azureProxyReturn = default) @@ -195,7 +195,14 @@ internal static AzureProxyReturnValues Get() internal bool PoolStateExists(string poolId) => poolState.ContainsKey(poolId); - private readonly Dictionary ResizeErrors, IList PoolMetadata)> poolState = new(); + private record PoolState(int? CurrentDedicatedNodes, int? CurrentLowPriorityNodes, Microsoft.Azure.Batch.Common.AllocationState? AllocationState, Microsoft.Azure.Batch.Protocol.Models.AutoScaleRun AutoScaleRun, DateTime? AllocationStateTransitionTime, IList ResizeErrors, IList PoolMetadata) + { + public int? TargetDedicatedNodes { get; set; } + public int? TargetLowPriorityNodes { get; set; } + public bool? EnableAutoScale { get; set; } + } + + private readonly Dictionary poolState = new(); internal void SetPoolState( string id, @@ -225,24 +232,27 @@ internal void SetPoolState( } } - poolState[id] = ( + poolState[id] = new( currentDedicatedNodes ?? state.CurrentDedicatedNodes, currentLowPriorityNodes ?? state.CurrentLowPriorityNodes, - targetDedicatedNodes ?? state.TargetDedicatedNodes, - targetLowPriorityNodes ?? state.TargetLowPriorityNodes, allocationState ?? state.AllocationState, autoScaleRun ?? state.AutoScaleRun, - enableAutoScale ?? state.EnableAutoScale, allocationStateTransitionTime ?? state.AllocationStateTransitionTime, resizeErrors ?? state.ResizeErrors, - metadata.Count == 0 ? null : metadata.Select(ConvertMetadata).ToList()); + metadata.Count == 0 ? 
null : metadata.Select(ConvertMetadata).ToList()) + { + TargetDedicatedNodes = targetDedicatedNodes ?? state.TargetDedicatedNodes, + TargetLowPriorityNodes = targetLowPriorityNodes ?? state.TargetLowPriorityNodes, + EnableAutoScale = enableAutoScale ?? state.EnableAutoScale + }; static Microsoft.Azure.Batch.MetadataItem ConvertMetadata(KeyValuePair pair) => new(pair.Key, pair.Value); } else { - poolState.Add(id, (currentDedicatedNodes, currentLowPriorityNodes, targetDedicatedNodes, targetLowPriorityNodes, allocationState, autoScaleRun, enableAutoScale, allocationStateTransitionTime, resizeErrors, poolMetadata)); + poolState.Add(id, new(currentDedicatedNodes, currentLowPriorityNodes, allocationState, autoScaleRun, allocationStateTransitionTime, resizeErrors, poolMetadata) + { TargetDedicatedNodes = targetDedicatedNodes, TargetLowPriorityNodes = targetLowPriorityNodes, EnableAutoScale = enableAutoScale }); } } @@ -264,21 +274,25 @@ internal void SetPoolAutoScaleTargets(string id, int? targetDedicatedNodes = def state.TargetLowPriorityNodes = targetLowPriorityNodes ?? state.TargetLowPriorityNodes ?? 0; } - internal PoolInformation CreateBatchPoolImpl(Pool pool) + internal CloudPool CreateBatchPoolImpl(Pool pool) { - poolState.Add(pool.Name, - (0, - 0, - pool.ScaleSettings?.FixedScale?.TargetDedicatedNodes ?? 0, - pool.ScaleSettings?.FixedScale?.TargetLowPriorityNodes ?? 
0, - Microsoft.Azure.Batch.Common.AllocationState.Steady, - default, - pool.ScaleSettings?.AutoScale is not null, - default, - default, - pool.Metadata?.Select(ConvertMetadata).ToList())); - - return new() { PoolId = pool.Name }; + PoolState state = new( + CurrentDedicatedNodes: 0, + CurrentLowPriorityNodes: 0, + AllocationState: Microsoft.Azure.Batch.Common.AllocationState.Steady, + AutoScaleRun: default, + AllocationStateTransitionTime: default, + ResizeErrors: default, + PoolMetadata: pool.Metadata?.Select(ConvertMetadata).ToList()) + { + TargetDedicatedNodes = pool.ScaleSettings?.FixedScale?.TargetDedicatedNodes ?? 0, + TargetLowPriorityNodes = pool.ScaleSettings?.FixedScale?.TargetLowPriorityNodes ?? 0, + EnableAutoScale = pool.ScaleSettings?.AutoScale is not null + }; + + poolState.Add(pool.Name, state); + + return GetPoolFromState(pool.Name, state); static Microsoft.Azure.Batch.MetadataItem ConvertMetadata(Microsoft.Azure.Management.Batch.Models.MetadataItem item) => new(item.Name, item.Value); @@ -291,19 +305,22 @@ internal CloudPool GetBatchPoolImpl(string poolId) return GeneratePool(poolId); } - return GeneratePool( - poolId, - currentDedicatedNodes: state.CurrentDedicatedNodes, - currentLowPriorityNodes: state.CurrentLowPriorityNodes, - targetDedicatedNodes: state.TargetDedicatedNodes, - targetLowPriorityNodes: state.TargetLowPriorityNodes, - allocationState: state.AllocationState, - allocationStateTransitionTime: state.AllocationStateTransitionTime, - resizeErrors: state.ResizeErrors, - autoScaleRun: state.AutoScaleRun, - enableAutoScale: state.EnableAutoScale, - metadata: state.PoolMetadata); + return GetPoolFromState(poolId, state); } + + private static CloudPool GetPoolFromState(string poolId, PoolState poolState) + => GeneratePool( + id: poolId, + currentDedicatedNodes: poolState.CurrentDedicatedNodes, + currentLowPriorityNodes: poolState.CurrentLowPriorityNodes, + targetDedicatedNodes: poolState.TargetDedicatedNodes, + targetLowPriorityNodes: 
poolState.TargetLowPriorityNodes, + allocationState: poolState.AllocationState, + allocationStateTransitionTime: poolState.AllocationStateTransitionTime, + resizeErrors: poolState.ResizeErrors, + autoScaleRun: poolState.AutoScaleRun, + enableAutoScale: poolState.EnableAutoScale, + metadata: poolState.PoolMetadata); } private static Action> GetMockSkuInfoProvider(AzureProxyReturnValues azureProxyReturnValues) diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index d956770d6..a2f715fcd 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -46,7 +46,7 @@ public async Task LocalPoolCacheAccessesNewPoolsAfterAllPoolsRemovedWithSameKey( Assert.IsNotNull(pool); Assert.AreEqual(1, batchScheduler.GetPoolGroupKeys().Count()); - Assert.IsTrue(batchScheduler.TryGetPool(pool.Pool.PoolId, out var pool1)); + Assert.IsTrue(batchScheduler.TryGetPool(pool.Id, out var pool1)); Assert.AreSame(pool, pool1); } @@ -67,7 +67,7 @@ public async Task GetOrAddDoesNotAddExistingAvailablePool() Assert.AreEqual(batchScheduler.GetPools().Count(), count); Assert.AreEqual(batchScheduler.GetPoolGroupKeys().Count(), keyCount); //Assert.AreSame(info, pool); - Assert.AreEqual(info.Pool.PoolId, pool.Pool.PoolId); + Assert.AreEqual(info.Id, pool.Id); serviceProvider.AzureProxy.Verify(mock => mock.CreateBatchPoolAsync(It.IsAny(), It.IsAny(), It.IsAny()), Times.Once); } @@ -89,7 +89,7 @@ public async Task GetOrAddDoesAddWithExistingUnavailablePool() Assert.AreNotEqual(batchScheduler.GetPools().Count(), count); Assert.AreEqual(batchScheduler.GetPoolGroupKeys().Count(), keyCount); //Assert.AreNotSame(info, pool); - Assert.AreNotEqual(info.Pool.PoolId, pool.Pool.PoolId); + Assert.AreNotEqual(info.Id, pool.Id); } @@ -100,11 +100,11 @@ public async Task TryGetReturnsTrueAndCorrectPool() var batchScheduler = serviceProvider.GetT() as BatchScheduler; var info = await AddPool(batchScheduler); - var result = 
batchScheduler.TryGetPool(info.Pool.PoolId, out var pool); + var result = batchScheduler.TryGetPool(info.Id, out var pool); Assert.IsTrue(result); //Assert.AreSame(infoPoolId, pool); - Assert.AreEqual(info.Pool.PoolId, pool.Pool.PoolId); + Assert.AreEqual(info.Id, pool.Id); } [TestMethod] @@ -161,7 +161,7 @@ public async Task UnavailablePoolsAreRemoved() await pool.ServicePoolAsync(BatchPool.ServiceKind.RemovePoolIfEmpty); - Assert.AreEqual(pool.Pool.PoolId, poolId); + Assert.AreEqual(pool.Id, poolId); Assert.IsFalse(batchScheduler.IsPoolAvailable("key1")); Assert.IsFalse(batchScheduler.GetPools().Any()); } @@ -357,8 +357,8 @@ public Task AddBatchTaskHandlesAzureBatchPoolCreationExceptionViaJobCreation() return AddBatchTaskHandlesExceptions(TesState.QUEUEDEnum, Arranger, Validator); (Action, Action>) Arranger(AzureProxyReturnValues _1) - => (default, azureProxy => azureProxy.Setup(b => b.CreateBatchJobAsync(It.IsAny(), It.IsAny())) - .Callback((poolInfo, cancellationToken) + => (default, azureProxy => azureProxy.Setup(b => b.CreateBatchJobAsync(It.IsAny(), It.IsAny())) + .Callback((_1, _2) => throw new Microsoft.Rest.Azure.CloudException("No job for you.") { Body = new() { Code = BatchErrorCodeStrings.OperationTimedOut } })); void Validator(TesTask tesTask, IEnumerable<(LogLevel logLevel, Exception exception)> logs) @@ -381,7 +381,7 @@ public Task AddBatchTaskHandlesAzureBatchPoolCreationExceptionViaPoolCreation() (Action, Action>) Arranger(AzureProxyReturnValues _1) => (default, azureProxy => azureProxy.Setup(b => b.CreateBatchPoolAsync(It.IsAny(), It.IsAny(), It.IsAny())) - .Callback((poolInfo, isPreemptible, cancellationToken) + .Callback((_1, _2, _3) => throw new Microsoft.Rest.Azure.CloudException("No job for you.") { Body = new() { Code = BatchErrorCodeStrings.OperationTimedOut } })); void Validator(TesTask tesTask, IEnumerable<(LogLevel logLevel, Exception exception)> logs) @@ -492,8 +492,8 @@ public Task AddBatchTaskHandlesBatchClientException() return 
AddBatchTaskHandlesExceptions(TesState.SYSTEMERROREnum, Arranger, Validator); (Action, Action>) Arranger(AzureProxyReturnValues _1) - => (default, azureProxy => azureProxy.Setup(b => b.AddBatchTaskAsync(It.IsAny(), It.IsAny(), It.IsAny(), It.IsAny())) - .Callback((tesTaskId, cloudTask, poolInfo, cancellationToken) + => (default, azureProxy => azureProxy.Setup(b => b.AddBatchTaskAsync(It.IsAny(), It.IsAny(), It.IsAny(), It.IsAny())) + .Callback((_1, _2, _3, _4) => throw typeof(BatchClientException) .GetConstructor(System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance, new[] { typeof(string), typeof(Exception) }) @@ -518,8 +518,8 @@ public Task AddBatchTaskHandlesBatchExceptionForJobQuota() return AddBatchTaskHandlesExceptions(TesState.QUEUEDEnum, Arranger, Validator); (Action, Action>) Arranger(AzureProxyReturnValues _1) - => (default, azureProxy => azureProxy.Setup(b => b.CreateBatchJobAsync(It.IsAny(), It.IsAny())) - .Callback((poolInfo, cancellationToken) + => (default, azureProxy => azureProxy.Setup(b => b.CreateBatchJobAsync(It.IsAny(), It.IsAny())) + .Callback((_1, _2) => throw new BatchException( new Mock().Object, default, @@ -629,16 +629,15 @@ public async Task BatchJobContainsExpectedBatchPoolInformation() var addBatchTaskAsyncInvocation = serviceProvider.AzureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IAzureProxy.AddBatchTaskAsync)); var cloudTask = addBatchTaskAsyncInvocation?.Arguments[1] as CloudTask; - var poolInformation = addBatchTaskAsyncInvocation?.Arguments[2] as PoolInformation; + var poolId = addBatchTaskAsyncInvocation?.Arguments[2] as string; var pool = createBatchPoolAsyncInvocation?.Arguments[0] as Pool; GuardAssertsWithTesTask(tesTask, () => { - Assert.IsNull(poolInformation.AutoPoolSpecification); - Assert.IsNotNull(poolInformation.PoolId); - Assert.AreEqual("TES-hostname-edicated1-6aczoqjox53tytv3h7hxwrp5t5ne4yzs-", poolInformation.PoolId[0..^8]); + Assert.IsNotNull(poolId); + 
Assert.AreEqual("TES-hostname-edicated1-6aczoqjox53tytv3h7hxwrp5t5ne4yzs-", poolId[0..^8]); Assert.AreEqual("VmSizeDedicated1", pool.VmSize); - Assert.IsTrue(((BatchScheduler)batchScheduler).TryGetPool(poolInformation.PoolId, out _)); + Assert.IsTrue(((BatchScheduler)batchScheduler).TryGetPool(poolId, out _)); Assert.AreEqual(1, pool.DeploymentConfiguration.VirtualMachineConfiguration.ContainerConfiguration.ContainerRegistries.Count); }); } @@ -1924,12 +1923,13 @@ internal void AzureProxyDeleteBatchPoolImpl(string poolId, System.Threading.Canc AzureProxyDeleteBatchPool(poolId, cancellationToken); } - internal PoolInformation CreateBatchPoolImpl(Pool pool) + internal CloudPool CreateBatchPoolImpl(Pool pool) { var poolId = pool.Name; + var metadata = pool.Metadata?.Select(Convert).ToList(); - poolMetadata.Add(poolId, pool.Metadata?.Select(Convert).ToList()); - return new() { PoolId = poolId }; + poolMetadata.Add(poolId, metadata); + return BatchPoolTests.GeneratePool(id: poolId, creationTime: DateTime.UtcNow, metadata: metadata); static Microsoft.Azure.Batch.MetadataItem Convert(Microsoft.Azure.Management.Batch.Models.MetadataItem item) => new(item.Name, item.Value); diff --git a/src/TesApi.Tests/TerraBatchPoolManagerTests.cs b/src/TesApi.Tests/TerraBatchPoolManagerTests.cs index cbba97edf..ec6a5f51d 100644 --- a/src/TesApi.Tests/TerraBatchPoolManagerTests.cs +++ b/src/TesApi.Tests/TerraBatchPoolManagerTests.cs @@ -72,10 +72,10 @@ public async Task CreateBatchPoolAsync_ValidResponse() UserAccounts = new List() { new UserAccount("name", "password") } }; - var pool = await terraBatchPoolManager.CreateBatchPoolAsync(poolInfo, false, System.Threading.CancellationToken.None); + var poolId = await terraBatchPoolManager.CreateBatchPoolAsync(poolInfo, false, System.Threading.CancellationToken.None); - Assert.IsNotNull(pool); - Assert.AreEqual(terraApiStubData.PoolId, pool.PoolId); + Assert.IsNotNull(poolId); + Assert.AreEqual(terraApiStubData.PoolId, poolId); } [TestMethod] 
@@ -117,12 +117,12 @@ public async Task CreateBatchPoolAsync_MultipleCallsHaveDifferentNameAndResource }, }; - var pool = await terraBatchPoolManager.CreateBatchPoolAsync(poolInfo, false, System.Threading.CancellationToken.None); + _ = await terraBatchPoolManager.CreateBatchPoolAsync(poolInfo, false, System.Threading.CancellationToken.None); var name = capturedApiCreateBatchPoolRequest.Common.Name; var resourceId = capturedApiCreateBatchPoolRequest.Common.ResourceId; - pool = await terraBatchPoolManager.CreateBatchPoolAsync(poolInfo, false, System.Threading.CancellationToken.None); + _ = await terraBatchPoolManager.CreateBatchPoolAsync(poolInfo, false, System.Threading.CancellationToken.None); Assert.AreNotEqual(name, capturedApiCreateBatchPoolRequest.Common.Name); Assert.AreNotEqual(resourceId, capturedApiCreateBatchPoolRequest.Common.ResourceId); @@ -147,12 +147,12 @@ public async Task CreateBatchPoolAsync_UserIdentityMapsCorrectly() Identity = new BatchPoolIdentity(PoolIdentityType.UserAssigned, identities) }; - var pool = await terraBatchPoolManager.CreateBatchPoolAsync(poolInfo, false, System.Threading.CancellationToken.None); + _ = await terraBatchPoolManager.CreateBatchPoolAsync(poolInfo, false, System.Threading.CancellationToken.None); var name = capturedApiCreateBatchPoolRequest.Common.Name; var resourceId = capturedApiCreateBatchPoolRequest.Common.ResourceId; - pool = await terraBatchPoolManager.CreateBatchPoolAsync(poolInfo, false, System.Threading.CancellationToken.None); + _ = await terraBatchPoolManager.CreateBatchPoolAsync(poolInfo, false, System.Threading.CancellationToken.None); Assert.AreEqual(identityName, capturedApiCreateBatchPoolRequest.AzureBatchPool.UserAssignedIdentities.SingleOrDefault().Name); } @@ -170,7 +170,7 @@ public async Task CreateBatchPoolAsync_UserIdentityInStartTaskMapsCorrectly() } }; - var pool = await terraBatchPoolManager.CreateBatchPoolAsync(poolInfo, false, System.Threading.CancellationToken.None); + _ = await 
terraBatchPoolManager.CreateBatchPoolAsync(poolInfo, false, System.Threading.CancellationToken.None); var captureUserIdentity = capturedApiCreateBatchPoolRequest.AzureBatchPool.StartTask.UserIdentity; diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index 3fce4afb8..dcc343bbc 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -23,19 +23,13 @@ using Microsoft.WindowsAzure.Storage.Blob; using Polly; using Polly.Retry; -using Tes.Models; using TesApi.Web.Management.Batch; using TesApi.Web.Management.Configuration; using TesApi.Web.Storage; using BatchModels = Microsoft.Azure.Management.Batch.Models; using CloudTask = Microsoft.Azure.Batch.CloudTask; -using ComputeNodeState = Microsoft.Azure.Batch.Common.ComputeNodeState; using FluentAzure = Microsoft.Azure.Management.Fluent.Azure; -using JobState = Microsoft.Azure.Batch.Common.JobState; using OnAllTasksComplete = Microsoft.Azure.Batch.Common.OnAllTasksComplete; -using PoolInformation = Microsoft.Azure.Batch.PoolInformation; -using TaskExecutionInformation = Microsoft.Azure.Batch.TaskExecutionInformation; -using TaskState = Microsoft.Azure.Batch.Common.TaskState; namespace TesApi.Web { @@ -200,28 +194,28 @@ public int GetBatchActiveJobCount() } /// - public async Task CreateBatchJobAsync(PoolInformation poolInformation, CancellationToken cancellationToken) + public async Task CreateBatchJobAsync(string jobId, CancellationToken cancellationToken) { - ArgumentException.ThrowIfNullOrEmpty(poolInformation?.PoolId, nameof(poolInformation)); + ArgumentException.ThrowIfNullOrEmpty(jobId); - logger.LogInformation("TES: Creating Batch job {BatchJob}", poolInformation.PoolId); - var job = batchClient.JobOperations.CreateJob(poolInformation.PoolId, poolInformation); + logger.LogInformation("TES: Creating Batch job {BatchJob}", jobId); + var job = batchClient.JobOperations.CreateJob(jobId, new() { PoolId = jobId }); job.OnAllTasksComplete = OnAllTasksComplete.NoAction; 
job.OnTaskFailure = OnTaskFailure.NoAction; await job.CommitAsync(cancellationToken: cancellationToken); - logger.LogInformation("TES: Batch job {BatchJob} committed successfully", poolInformation.PoolId); + logger.LogInformation("TES: Batch job {BatchJob} committed successfully", jobId); await Task.Delay(TimeSpan.FromSeconds(2), cancellationToken); } /// - public async Task AddBatchTaskAsync(string tesTaskId, CloudTask cloudTask, PoolInformation poolInformation, CancellationToken cancellationToken) + public async Task AddBatchTaskAsync(string tesTaskId, CloudTask cloudTask, string jobId, CancellationToken cancellationToken) { - ArgumentException.ThrowIfNullOrEmpty(poolInformation?.PoolId, nameof(poolInformation)); + ArgumentException.ThrowIfNullOrEmpty(jobId); - logger.LogInformation("TES task: {TesTask} - Adding task to job {BatchJob}", tesTaskId, poolInformation.PoolId); + logger.LogInformation("TES task: {TesTask} - Adding task to job {BatchJob}", tesTaskId, jobId); var job = await batchRaceConditionJobNotFoundRetryPolicy.ExecuteAsync(ct => - batchClient.JobOperations.GetJobAsync(poolInformation.PoolId, cancellationToken: ct), + batchClient.JobOperations.GetJobAsync(jobId, cancellationToken: ct), cancellationToken); await job.AddTaskAsync(cloudTask, cancellationToken: cancellationToken); @@ -229,34 +223,11 @@ public async Task AddBatchTaskAsync(string tesTaskId, CloudTask cloudTask, PoolI } /// - public async Task DeleteBatchJobAsync(PoolInformation poolInformation, CancellationToken cancellationToken) + public async Task DeleteBatchJobAsync(string jobId, CancellationToken cancellationToken) { - ArgumentException.ThrowIfNullOrEmpty(poolInformation?.PoolId, nameof(poolInformation)); - logger.LogInformation("Deleting job {BatchJob}", poolInformation.PoolId); - await batchClient.JobOperations.DeleteJobAsync(poolInformation.PoolId, cancellationToken: cancellationToken); - } - - /// - public async Task DeleteBatchJobAsync(string tesTaskId, CancellationToken 
cancellationToken = default) - { - var jobFilter = new ODATADetailLevel - { - FilterClause = $"startswith(id,'{tesTaskId}{BatchJobAttemptSeparator}') and state ne 'deleting'", - SelectClause = "id" - }; - - var batchJobsToDelete = await batchClient.JobOperations.ListJobs(jobFilter).ToAsyncEnumerable().ToListAsync(cancellationToken); - - if (batchJobsToDelete.Count > 1) - { - logger.LogWarning($"Found more than one active job for TES task {tesTaskId}"); - } - - foreach (var job in batchJobsToDelete) - { - logger.LogInformation($"Deleting job {job.Id}"); - await job.DeleteAsync(cancellationToken: cancellationToken); - } + ArgumentException.ThrowIfNullOrEmpty(jobId); + logger.LogInformation("Deleting job {BatchJob}", jobId); + await batchClient.JobOperations.DeleteJobAsync(jobId, cancellationToken: cancellationToken); } /// @@ -325,47 +296,6 @@ public async Task DeleteBatchTaskAsync(string tesTaskId, string jobId, Cancellat } } - /// - public async Task> ListOldJobsToDeleteAsync(TimeSpan oldestJobAge, CancellationToken cancellationToken) - { - var filter = new ODATADetailLevel - { - FilterClause = $"state eq 'completed' and executionInfo/endTime lt DateTime'{DateTime.Today.Subtract(oldestJobAge):yyyy-MM-ddTHH:mm:ssZ}'", - SelectClause = "id" - }; - - return await batchClient.JobOperations.ListJobs(filter).ToAsyncEnumerable().Select(c => c.Id).ToListAsync(cancellationToken); - } - - /// - public async Task> ListOrphanedJobsToDeleteAsync(TimeSpan minJobAge, CancellationToken cancellationToken = default) - { - var filter = new ODATADetailLevel - { - FilterClause = $"state eq 'active' and creationTime lt DateTime'{DateTime.UtcNow.Subtract(minJobAge):yyyy-MM-ddTHH:mm:ssZ}'", - SelectClause = "id,poolInfo,onAllTasksComplete" - }; - - var noActionTesjobs = batchClient.JobOperations.ListJobs(filter).ToAsyncEnumerable() - .Where(j => j.PoolInformation?.AutoPoolSpecification?.AutoPoolIdPrefix == "TES" && j.OnAllTasksComplete == OnAllTasksComplete.NoAction); - - var 
noActionTesjobsWithNoTasks = noActionTesjobs.WhereAwait(async j => !await j.ListTasks().ToAsyncEnumerable().AnyAsync(cancellationToken)); - - return await noActionTesjobsWithNoTasks.Select(j => j.Id).ToListAsync(cancellationToken); - } - - /// - public async Task> GetActivePoolIdsAsync(string prefix, TimeSpan minAge, CancellationToken cancellationToken = default) - { - var activePoolsFilter = new ODATADetailLevel - { - FilterClause = $"state eq 'active' and startswith(id, '{prefix}') and creationTime lt DateTime'{DateTime.UtcNow.Subtract(minAge):yyyy-MM-ddTHH:mm:ssZ}'", - SelectClause = "id" - }; - - return (await batchClient.PoolOperations.ListPools(activePoolsFilter).ToListAsync(cancellationToken)).Select(p => p.Id); - } - /// public IAsyncEnumerable GetActivePoolsAsync(string hostName) { @@ -379,12 +309,6 @@ public IAsyncEnumerable GetActivePoolsAsync(string hostName) .Where(p => hostName.Equals(p.Metadata?.FirstOrDefault(m => BatchScheduler.PoolHostName.Equals(m.Name, StringComparison.Ordinal))?.Value, StringComparison.OrdinalIgnoreCase)); } - /// - public async Task> GetPoolIdsReferencedByJobsAsync(CancellationToken cancellationToken = default) - => (await batchClient.JobOperations.ListJobs(new ODATADetailLevel(selectClause: "executionInfo")).ToListAsync(cancellationToken)) - .Where(j => !string.IsNullOrEmpty(j.ExecutionInformation?.PoolId)) - .Select(j => j.ExecutionInformation.PoolId); - /// public Task DeleteBatchComputeNodesAsync(string poolId, IEnumerable computeNodes, CancellationToken cancellationToken = default) => batchClient.PoolOperations.RemoveFromPoolAsync(poolId, computeNodes, deallocationOption: ComputeNodeDeallocationOption.Requeue, cancellationToken: cancellationToken); @@ -535,12 +459,13 @@ private static Task GetAzureAccessTokenAsync(CancellationToken cancellat } /// - public async Task CreateBatchPoolAsync(BatchModels.Pool poolInfo, bool isPreemptable, CancellationToken cancellationToken) + public async Task 
CreateBatchPoolAsync(BatchModels.Pool poolInfo, bool isPreemptable, CancellationToken cancellationToken) { logger.LogInformation("Creating batch pool named {PoolName} with vmSize {PoolVmSize} and low priority {IsPreemptable}", poolInfo.Name, poolInfo.VmSize, isPreemptable); - var pool = await batchPoolManager.CreateBatchPoolAsync(poolInfo, isPreemptable, cancellationToken); + var poolId = await batchPoolManager.CreateBatchPoolAsync(poolInfo, isPreemptable, cancellationToken); logger.LogInformation("Successfully created batch pool named {PoolName} with vmSize {PoolVmSize} and low priority {IsPreemptable}", poolInfo.Name, poolInfo.VmSize, isPreemptable); - return pool; + + return await batchClient.PoolOperations.GetPoolAsync(poolId, detailLevel: new ODATADetailLevel { SelectClause = BatchPool.CloudPoolSelectClause }, cancellationToken: cancellationToken); } // https://learn.microsoft.com/azure/azure-resource-manager/management/move-resource-group-and-subscription#changed-resource-id @@ -601,7 +526,7 @@ public async Task EnableBatchPoolAutoScaleAsync(string poolId, bool preemptable, } var formula = formulaFactory(preemptable, preemptable ? currentLowPriority ?? 0 : currentDedicated ?? 
0); - logger.LogDebug("Setting Pool {PoolID} to AutoScale({AutoScaleInterval}): '{AutoScaleFormula}'", poolId, interval, formula); + logger.LogDebug("Setting Pool {PoolID} to AutoScale({AutoScaleInterval}): '{AutoScaleFormula}'", poolId, interval, formula.Replace(Environment.NewLine, @"\n")); await batchClient.PoolOperations.EnableAutoScaleAsync(poolId, formula, interval, cancellationToken: cancellationToken); } diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index 020ed0a9a..a5f1e1136 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -58,16 +58,16 @@ public BatchPool(IBatchScheduler batchScheduler, IOptions ResizeErrors { get; } = new(); private IAsyncEnumerable GetTasksAsync(string select, string filter) - => _azureProxy.ListTasksAsync(Pool.PoolId, new ODATADetailLevel { SelectClause = select, FilterClause = filter }); + => _azureProxy.ListTasksAsync(Id, new ODATADetailLevel { SelectClause = select, FilterClause = filter }); internal IAsyncEnumerable GetTasksAsync(bool includeCompleted) => GetTasksAsync("id,stateTransitionTime", includeCompleted ? default : "state ne 'completed'"); private async ValueTask RemoveNodesAsync(IList nodesToRemove, CancellationToken cancellationToken) { - _logger.LogDebug("Removing {Nodes} nodes from {PoolId}", nodesToRemove.Count, Pool.PoolId); + _logger.LogDebug("Removing {Nodes} nodes from {PoolId}", nodesToRemove.Count, Id); _resizeErrorsRetrieved = false; - await _azureProxy.DeleteBatchComputeNodesAsync(Pool.PoolId, nodesToRemove, cancellationToken); + await _azureProxy.DeleteBatchComputeNodesAsync(Id, nodesToRemove, cancellationToken); } } @@ -112,7 +112,7 @@ private void EnsureScalingModeSet(bool? 
autoScaleEnabled) private async ValueTask ServicePoolGetResizeErrorsAsync(CancellationToken cancellationToken) { - var currentAllocationState = await _azureProxy.GetFullAllocationStateAsync(Pool.PoolId, cancellationToken); + var currentAllocationState = await _azureProxy.GetFullAllocationStateAsync(Id, cancellationToken); EnsureScalingModeSet(currentAllocationState.AutoScaleEnabled); if (_scalingMode == ScalingMode.AutoScaleEnabled) @@ -122,7 +122,7 @@ private async ValueTask ServicePoolGetResizeErrorsAsync(CancellationToken cancel if (!_resizeErrorsRetrieved) { ResizeErrors.Clear(); - var pool = await _azureProxy.GetBatchPoolAsync(Pool.PoolId, cancellationToken, new ODATADetailLevel { SelectClause = "id,allocationStateTransitionTime,autoScaleFormula,autoScaleRun,resizeErrors" }); + var pool = await _azureProxy.GetBatchPoolAsync(Id, cancellationToken, new ODATADetailLevel { SelectClause = "id,allocationStateTransitionTime,autoScaleFormula,autoScaleRun,resizeErrors" }); var now = DateTime.UtcNow; var autoScaleRunCutoff = now - (5 * AutoScaleEvaluationInterval); // It takes some cycles to reset autoscale, so give batch some time to catch up on its own. var autoScaleTransitionCutoff = now - (10 * AutoScaleEvaluationInterval); @@ -130,7 +130,13 @@ private async ValueTask ServicePoolGetResizeErrorsAsync(CancellationToken cancel if (pool.AutoScaleRun?.Error is not null || (autoScaleRunCutoff > Creation && pool.AutoScaleRun?.Timestamp < autoScaleRunCutoff)) { _resetAutoScalingRequired |= true; - _logger.LogDebug("Resetting AutoScale for pool {PoolId} because AutoScaleRun error or timestamp is older than {AutoScaleRunCutoff}.", Pool.PoolId, autoScaleRunCutoff); + _logger.LogDebug("Resetting AutoScale for pool {PoolId} because AutoScaleRun error '{AutoScaleRunError}' or timestamp '{AutoScaleRunTimestamp}' is older than {AutoScaleRunCutoff}.", Id, pool.AutoScaleRun?.Error?.Code ?? "n/a", pool.AutoScaleRun?.Timestamp.ToUniversalTime().ToString("O") ?? 
"n/a", autoScaleRunCutoff.ToUniversalTime().ToString("O")); + + if (pool.AutoScaleRun?.Error is not null) + { + _logger.LogDebug("AutoScale({PoolId}) Error '{AutoScaleRunErrorMessage}': Details: {AutoScaleRunErrorValues} .", Id, pool.AutoScaleRun?.Error?.Message?? "n/a", + string.Join(", ", (pool.AutoScaleRun?.Error?.Values ?? Enumerable.Empty()).Select(pair => $"'{pair.Name}': '{pair.Value}'"))); + } } else { @@ -152,20 +158,20 @@ private async ValueTask ServicePoolGetResizeErrorsAsync(CancellationToken cancel (await _azureProxy.EvaluateAutoScaleAsync(pool.Id, pool.AutoScaleFormula, cancellationToken: cancellationToken))?.Error is not null) { _resetAutoScalingRequired |= true; - _logger.LogDebug("Resetting AutoScale for pool {PoolId} because AllocationStateTransitionTime timestamp is older than {AllocationStateTransitionTimeCutoff} and formula evaluation fails.", Pool.PoolId, autoScaleTransitionCutoff); + _logger.LogDebug("Resetting AutoScale for pool {PoolId} because AllocationStateTransitionTime timestamp is older than {AllocationStateTransitionTimeCutoff} and formula evaluation fails.", Id, autoScaleTransitionCutoff); } break; // Errors to force autoscale to be reset case PoolResizeErrorCodes.ResizeStopped: _resetAutoScalingRequired |= true; - _logger.LogDebug("Resetting AutoScale for pool {PoolId} because pool resize was stopped.", Pool.PoolId); + _logger.LogDebug("Resetting AutoScale for pool {PoolId} because pool resize was stopped.", Id); break; // Errors to both force resetting autoscale and fail tasks case PoolResizeErrorCodes.AllocationFailed: _resetAutoScalingRequired |= true; - _logger.LogDebug("Resetting AutoScale for pool {PoolId} because pool allocation failed.", Pool.PoolId); + _logger.LogDebug("Resetting AutoScale for pool {PoolId} because pool allocation failed.", Id); goto default; // Errors to fail tasks should be directed here @@ -240,7 +246,7 @@ private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken canc { // This method 
implememts a state machine to disable/enable autoscaling as needed to clear certain conditions that can be observed - var (allocationState, autoScaleEnabled, _, _, _, _) = await _azureProxy.GetFullAllocationStateAsync(Pool.PoolId, cancellationToken); + var (allocationState, autoScaleEnabled, _, _, _, _) = await _azureProxy.GetFullAllocationStateAsync(Id, cancellationToken); EnsureScalingModeSet(autoScaleEnabled); if (allocationState == AllocationState.Steady) @@ -250,8 +256,8 @@ private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken canc case ScalingMode.AutoScaleEnabled: if (_resetAutoScalingRequired || await GetNodesToRemove(false).AnyAsync(cancellationToken)) { - _logger.LogInformation(@"Switching pool {PoolId} to manual scale to clear resize errors and/or compute nodes in invalid states.", Pool.PoolId); - await _azureProxy.DisableBatchPoolAutoScaleAsync(Pool.PoolId, cancellationToken); + _logger.LogInformation(@"Switching pool {PoolId} to manual scale to clear resize errors and/or compute nodes in invalid states.", Id); + await _azureProxy.DisableBatchPoolAutoScaleAsync(Id, cancellationToken); _scalingMode = ScalingMode.SettingManualScale; } break; @@ -307,8 +313,8 @@ private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken canc _scalingMode = ScalingMode.RemovingFailedNodes; ResizeErrors.Clear(); _resizeErrorsRetrieved = true; - _logger.LogInformation(@"Switching pool {PoolId} back to autoscale.", Pool.PoolId); - await _azureProxy.EnableBatchPoolAutoScaleAsync(Pool.PoolId, !IsDedicated, AutoScaleEvaluationInterval, (p, t) => AutoPoolFormula(p, GetTaskCount(t)), cancellationToken); + _logger.LogInformation(@"Switching pool {PoolId} back to autoscale.", Id); + await _azureProxy.EnableBatchPoolAutoScaleAsync(Id, !IsDedicated, AutoScaleEvaluationInterval, (p, t) => AutoPoolFormula(p, GetTaskCount(t)), cancellationToken); _autoScaleWaitTime = DateTime.UtcNow + (3 * AutoScaleEvaluationInterval) + BatchPoolService.RunInterval; 
_scalingMode = _resetAutoScalingRequired ? ScalingMode.WaitingForAutoScale : ScalingMode.SettingAutoScale; break; @@ -323,7 +329,7 @@ private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken canc case ScalingMode.SettingAutoScale: _scalingMode = ScalingMode.AutoScaleEnabled; - _logger.LogInformation(@"Pool {PoolId} is back to normal resize and monitoring status.", Pool.PoolId); + _logger.LogInformation(@"Pool {PoolId} is back to normal resize and monitoring status.", Id); break; } @@ -341,7 +347,7 @@ int GetTaskCount(int @default) // Used to make reenabling auto-scale more perfor } IAsyncEnumerable GetNodesToRemove(bool withState) - => _azureProxy.ListComputeNodesAsync(Pool.PoolId, new ODATADetailLevel(filterClause: @"state eq 'starttaskfailed' or state eq 'preempted' or state eq 'unusable'", selectClause: withState ? @"id,state,startTaskInfo" : @"id")); + => _azureProxy.ListComputeNodesAsync(Id, new ODATADetailLevel(filterClause: @"state eq 'starttaskfailed' or state eq 'preempted' or state eq 'unusable'", selectClause: withState ? @"id,state,startTaskInfo" : @"id")); } private bool DetermineIsAvailable(DateTime? 
creation) @@ -361,7 +367,7 @@ private async ValueTask ServicePoolRemovePoolIfEmptyAsync(CancellationToken canc { if (!IsAvailable) { - var (_, _, _, lowPriorityNodes, _, dedicatedNodes) = await _azureProxy.GetFullAllocationStateAsync(Pool.PoolId, cancellationToken); + var (_, _, _, lowPriorityNodes, _, dedicatedNodes) = await _azureProxy.GetFullAllocationStateAsync(Id, cancellationToken); if (lowPriorityNodes.GetValueOrDefault(0) == 0 && dedicatedNodes.GetValueOrDefault(0) == 0 && !await GetTasksAsync(includeCompleted: true).AnyAsync(cancellationToken)) { _ = _batchPools.RemovePoolFromList(this); @@ -408,7 +414,7 @@ internal enum ServiceKind public bool IsAvailable { get; private set; } = true; /// - public PoolInformation Pool { get; private set; } + public string Id { get; private set; } /// public async ValueTask CanBeDeleted(CancellationToken cancellationToken = default) @@ -418,7 +424,7 @@ public async ValueTask CanBeDeleted(CancellationToken cancellationToken = return false; } - await foreach (var node in _azureProxy.ListComputeNodesAsync(Pool.PoolId, new ODATADetailLevel(selectClause: "state")).WithCancellation(cancellationToken)) + await foreach (var node in _azureProxy.ListComputeNodesAsync(Id, new ODATADetailLevel(selectClause: "state")).WithCancellation(cancellationToken)) { switch (node.State) { @@ -570,7 +576,7 @@ private async ValueTask RemoveMissingPoolsAsync(Exception ex, Cancellation if (batchException.RequestInformation.BatchError.Code == BatchErrorCodeStrings.PoolNotFound || batchException.RequestInformation.BatchError.Code == BatchErrorCodeStrings.JobNotFound) { - _logger.LogError(ex, "Batch pool and/or job {PoolId} is missing. Removing them from TES's active pool list.", Pool.PoolId); + _logger.LogError(ex, "Batch pool and/or job {PoolId} is missing. 
Removing them from TES's active pool list.", Id); _ = _batchPools.RemovePoolFromList(this); await _batchPools.DeletePoolAsync(this, cancellationToken); return false; @@ -582,7 +588,7 @@ private async ValueTask RemoveMissingPoolsAsync(Exception ex, Cancellation /// public async ValueTask GetAllocationStateTransitionTime(CancellationToken cancellationToken = default) - => (await _azureProxy.GetBatchPoolAsync(Pool.PoolId, cancellationToken, new ODATADetailLevel { SelectClause = "allocationStateTransitionTime" })).AllocationStateTransitionTime ?? DateTime.UtcNow; + => (await _azureProxy.GetBatchPoolAsync(Id, cancellationToken, new ODATADetailLevel { SelectClause = "allocationStateTransitionTime" })).AllocationStateTransitionTime ?? DateTime.UtcNow; /// public async ValueTask CreatePoolAndJobAsync(Microsoft.Azure.Management.Batch.Models.Pool poolModel, bool isPreemptible, CancellationToken cancellationToken) @@ -591,12 +597,8 @@ public async ValueTask CreatePoolAndJobAsync(Microsoft.Azure.Management.Batch.Mo { CloudPool pool = default; await Task.WhenAll( - _azureProxy.CreateBatchJobAsync(new() { PoolId = poolModel.Name }, cancellationToken), - Task.Run(async () => - { - var poolInfo = await _azureProxy.CreateBatchPoolAsync(poolModel, isPreemptible, cancellationToken); - pool = await _azureProxy.GetBatchPoolAsync(poolInfo.PoolId, cancellationToken, new ODATADetailLevel { SelectClause = CloudPoolSelectClause }); - }, cancellationToken)); + _azureProxy.CreateBatchJobAsync(poolModel.Name, cancellationToken), + Task.Run(async () => pool = await _azureProxy.CreateBatchPoolAsync(poolModel, isPreemptible, cancellationToken), cancellationToken)); Configure(pool); } @@ -620,7 +622,7 @@ await Task.WhenAll( Exception HandleException(Exception ex) { // When the batch management API creating the pool times out, it may or may not have created the pool. Add an inactive record to delete it if it did get created and try again later. 
That record will be removed later whether or not the pool was created. - Pool ??= new() { PoolId = poolModel.Name }; + Id ??= poolModel.Name; _ = _batchPools.AddPool(this); return ex switch { @@ -666,7 +668,7 @@ private void Configure(CloudPool pool) { ArgumentNullException.ThrowIfNull(pool); - Pool = new() { PoolId = pool.Id }; + Id = pool.Id; IsAvailable = DetermineIsAvailable(pool.CreationTime); if (IsAvailable) @@ -686,8 +688,8 @@ public sealed partial class BatchPool { internal int TestPendingReservationsCount => GetTasksAsync(includeCompleted: false).CountAsync().AsTask().Result; - internal int? TestTargetDedicated => _azureProxy.GetFullAllocationStateAsync(Pool.PoolId, CancellationToken.None).Result.TargetDedicated; - internal int? TestTargetLowPriority => _azureProxy.GetFullAllocationStateAsync(Pool.PoolId, CancellationToken.None).Result.TargetLowPriority; + internal int? TestTargetDedicated => _azureProxy.GetFullAllocationStateAsync(Id, CancellationToken.None).Result.TargetDedicated; + internal int? 
TestTargetLowPriority => _azureProxy.GetFullAllocationStateAsync(Id, CancellationToken.None).Result.TargetLowPriority; internal TimeSpan TestRotatePoolTime => _forcePoolRotationAge; diff --git a/src/TesApi.Web/BatchPoolService.cs b/src/TesApi.Web/BatchPoolService.cs index de2e65878..7e389e798 100644 --- a/src/TesApi.Web/BatchPoolService.cs +++ b/src/TesApi.Web/BatchPoolService.cs @@ -77,7 +77,7 @@ private async ValueTask ExecuteActionOnPoolsAsync(string pollName, Func ModelPoolFactory(string poolId, CancellationToken cancellationToken); - private (string PoolKey, string DisplayName) GetPoolKey(TesTask tesTask, VirtualMachineInformation virtualMachineInformation, ContainerConfiguration containerConfiguration, CancellationToken cancellationToken) + private (string PoolKey, string DisplayName) GetPoolKey(TesTask tesTask, VirtualMachineInformation virtualMachineInformation, BatchModels.ContainerConfiguration containerConfiguration, CancellationToken cancellationToken) { var identityResourceId = tesTask.Resources?.ContainsBackendParameterValue(TesResources.SupportedBackendParameters.workflow_execution_identity) == true ? 
tesTask.Resources?.GetBackendParameterValue(TesResources.SupportedBackendParameters.workflow_execution_identity) : default; var executorImage = tesTask.Executors.First().Image; @@ -183,7 +183,7 @@ public async ValueTask FlushPoolsAsync(IEnumerable assignedPools, Cancel .ToAsyncEnumerable() .WhereAwait(async p => await p.CanBeDeleted(cancellationToken)) .ToListAsync(cancellationToken)) - .Where(p => !assignedPools.Contains(p.Pool.PoolId)) + .Where(p => !assignedPools.Contains(p.Id)) .OrderBy(p => p.GetAllocationStateTransitionTime(cancellationToken)) .Take(neededPools.Count) .ToList(); @@ -204,11 +204,11 @@ public async ValueTask FlushPoolsAsync(IEnumerable assignedPools, Cancel public Task DeletePoolAsync(IBatchPool pool, CancellationToken cancellationToken) { // TODO: Consider moving any remaining tasks to another pool, or failing tasks explicitly - logger.LogDebug(@"Deleting pool and job {PoolId}", pool.Pool.PoolId); + logger.LogDebug(@"Deleting pool and job {PoolId}", pool.Id); return Task.WhenAll( - AllowIfNotFound(azureProxy.DeleteBatchPoolAsync(pool.Pool.PoolId, cancellationToken)), - AllowIfNotFound(azureProxy.DeleteBatchJobAsync(pool.Pool, cancellationToken))); + AllowIfNotFound(azureProxy.DeleteBatchPoolAsync(pool.Id, cancellationToken)), + AllowIfNotFound(azureProxy.DeleteBatchJobAsync(pool.Id, cancellationToken))); static async Task AllowIfNotFound(Task task) { @@ -228,10 +228,10 @@ private static string GetKeyFromPoolId(string poolId) private class BatchPoolEqualityComparer : IEqualityComparer { bool IEqualityComparer.Equals(IBatchPool x, IBatchPool y) - => x.Pool.PoolId?.Equals(y.Pool.PoolId) ?? false; + => x.Id?.Equals(y.Id) ?? false; int IEqualityComparer.GetHashCode(IBatchPool obj) - => obj.Pool.PoolId?.GetHashCode() ?? 0; + => obj.Id?.GetHashCode() ?? 
0; } #region Used for unit/module testing @@ -249,13 +249,13 @@ protected override string GetKeyForItem(PoolSet item) => item.Key; private static string GetKeyForItem(IBatchPool pool) - => pool is null ? default : GetKeyFromPoolId(pool.Pool.PoolId); + => pool is null ? default : GetKeyFromPoolId(pool.Id); public IEnumerable GetAllPools() => this.SelectMany(s => s); public IBatchPool GetPoolOrDefault(string poolId) - => TryGetValue(GetKeyFromPoolId(poolId), out var poolSet) ? poolSet.FirstOrDefault(p => p.Pool.PoolId.Equals(poolId, StringComparison.OrdinalIgnoreCase)) : default; + => TryGetValue(GetKeyFromPoolId(poolId), out var poolSet) ? poolSet.FirstOrDefault(p => p.Id.Equals(poolId, StringComparison.OrdinalIgnoreCase)) : default; public bool Add(IBatchPool pool) { diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index f258b7eed..3a0f610a5 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -459,7 +459,7 @@ public string GetTesTaskIdFromCloudTaskId(string cloudTaskId) /// public async IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessQueuedTesTasksAsync(TesTask[] tesTasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) { - var tasksMetadataByPoolKey = new Dictionary>(); + var tasksMetadataByPoolKey = new Dictionary>(); var poolKeyByTaskIds = new Dictionary(); var tasks = tesTasks.ToList(); @@ -540,7 +540,7 @@ public string GetTesTaskIdFromCloudTaskId(string cloudTaskId) try { - PoolInformation poolInformation = null; + string poolId = null; var tesTaskLog = tesTask.AddTesTaskLog(); tesTaskLog.VirtualMachineInfo = virtualMachineInfo; var identities = new List(); @@ -555,36 +555,32 @@ public string GetTesTaskIdFromCloudTaskId(string cloudTaskId) identities.Add(tesTask.Resources?.GetBackendParameterValue(TesResources.SupportedBackendParameters.workflow_execution_identity)); } - var useGen2 = 
virtualMachineInfo.HyperVGenerations?.Contains("V2"); - poolInformation = (await GetOrAddPoolAsync( + poolId = (await GetOrAddPoolAsync( key: poolKey, isPreemptable: virtualMachineInfo.LowPriority, - modelPoolFactory: async (id, ct) => ConvertPoolSpecificationToModelsPool( + modelPoolFactory: async (id, ct) => await GetPoolSpecification( name: id, displayName: displayName, poolIdentity: GetBatchPoolIdentity(identities.ToArray()), - pool: await GetPoolSpecification( - vmSize: virtualMachineInfo.VmSize, - autoscaled: true, - preemptable: virtualMachineInfo.LowPriority, - neededPoolNodesByPoolKey[poolKey], - nodeInfo: useGen2.GetValueOrDefault() ? gen2BatchNodeInfo : gen1BatchNodeInfo, - containerConfiguration: containerMetadata.ContainerConfiguration, - encryptionAtHostSupported: virtualMachineInfo.EncryptionAtHostSupported, - cancellationToken: ct)), - cancellationToken: cancellationToken) - ).Pool; + vmSize: virtualMachineInfo.VmSize, + autoscaled: true, + preemptable: virtualMachineInfo.LowPriority, + initialTarget: neededPoolNodesByPoolKey[poolKey], + nodeInfo: (virtualMachineInfo.HyperVGenerations?.Contains("V2")).GetValueOrDefault() ? gen2BatchNodeInfo : gen1BatchNodeInfo, + containerConfiguration: containerMetadata.ContainerConfiguration, + encryptionAtHostSupported: virtualMachineInfo.EncryptionAtHostSupported, + cancellationToken: ct), + cancellationToken: cancellationToken)).Id; var cloudTaskId = $"{tesTask.Id}-{tesTask.Logs.Count}"; - tesTask.PoolId = poolInformation.PoolId; + tesTask.PoolId = poolId; var cloudTask = await ConvertTesTaskToBatchTaskUsingRunnerAsync(cloudTaskId, tesTask, cancellationToken); logger.LogInformation(@"Creating batch task for TES task {TesTaskId}. 
Using VM size {VmSize}.", tesTask.Id, virtualMachineInfo.VmSize); - await azureProxy.AddBatchTaskAsync(tesTask.Id, cloudTask, poolInformation, cancellationToken); + await azureProxy.AddBatchTaskAsync(tesTask.Id, cloudTask, poolId, cancellationToken); tesTaskLog.StartTime = DateTimeOffset.UtcNow; tesTask.State = TesState.INITIALIZINGEnum; - poolInformation = null; } catch (AggregateException aggregateException) { @@ -909,7 +905,7 @@ public async Task CreateOutputFileDestinationInTesInterna /// A for controlling the lifetime of the asynchronous operation. /// /// This method also mitigates errors associated with docker daemons that are not configured to place their filesystem assets on the data drive. - private async Task StartTaskIfNeeded(VirtualMachineConfiguration machineConfiguration, CancellationToken cancellationToken) + private async Task StartTaskIfNeeded(BatchModels.VirtualMachineConfiguration machineConfiguration, CancellationToken cancellationToken) { var globalStartTaskConfigured = !string.IsNullOrWhiteSpace(globalStartTaskPath); @@ -934,7 +930,7 @@ private async Task StartTaskIfNeeded(VirtualMachineConfiguration mach var dockerConfigured = machineConfiguration.ImageReference.Publisher.Equals("microsoft-azure-batch", StringComparison.InvariantCultureIgnoreCase) && (machineConfiguration.ImageReference.Offer.StartsWith("ubuntu-server-container", StringComparison.InvariantCultureIgnoreCase) || machineConfiguration.ImageReference.Offer.StartsWith("centos-container", StringComparison.InvariantCultureIgnoreCase)); - if (!dockerConfigured) + var dockerConfigCmdLine = new Func(() => { var commandLine = new StringBuilder(); commandLine.Append(@"/usr/bin/bash -c 'trap ""echo Error trapped; exit 0"" ERR; sudo touch tmp2.json && (sudo cp /etc/docker/daemon.json tmp1.json || sudo echo {} > tmp1.json) && sudo chmod a+w tmp?.json && if fgrep ""$(dirname ""$(dirname ""$AZ_BATCH_NODE_ROOT_DIR"")"")/docker"" tmp1.json; then echo grep ""found docker path""; elif [ $? 
-eq 1 ]; then "); @@ -948,32 +944,32 @@ var s when s.StartsWith("batch.node.centos ") => "sudo yum install epel-release commandLine.Append(@" && jq \.\[\""data-root\""\]=\""""$(dirname ""$(dirname ""$AZ_BATCH_NODE_ROOT_DIR"")"")/docker""\"" tmp1.json >> tmp2.json && sudo cp tmp2.json /etc/docker/daemon.json && sudo chmod 644 /etc/docker/daemon.json && sudo systemctl restart docker && echo ""updated docker data-root""; else (echo ""grep failed"" || exit 1); fi'"); - var startTask = new StartTask - { - CommandLine = commandLine.ToString(), - UserIdentity = new UserIdentity(new AutoUserSpecification(elevationLevel: ElevationLevel.Admin, scope: AutoUserScope.Pool)), - }; + return commandLine.ToString(); + }); - if (globalStartTaskConfigured) - { - startTask.CommandLine = $"({startTask.CommandLine} && {CreateWgetDownloadCommand(startTaskSasUrl, StartTaskScriptFilename, setExecutable: true)}) && ./{StartTaskScriptFilename}"; - } + // Note that this has an embedded ')'. That is to faciliate merging with dockerConfigCmdLine. + var globalStartTaskCmdLine = new Func(() => $"{CreateWgetDownloadCommand(startTaskSasUrl, StartTaskScriptFilename, setExecutable: true)}) && ./{StartTaskScriptFilename}"); - return startTask; - } - else if (globalStartTaskConfigured) + BatchModels.StartTask startTask = new() { - return new StartTask + UserIdentity = new BatchModels.UserIdentity(autoUser: new BatchModels.AutoUserSpecification(elevationLevel: BatchModels.ElevationLevel.Admin, scope: BatchModels.AutoUserScope.Pool)), + CommandLine = (!dockerConfigured, globalStartTaskConfigured) switch { - CommandLine = $"./{StartTaskScriptFilename}", - UserIdentity = new UserIdentity(new AutoUserSpecification(elevationLevel: ElevationLevel.Admin, scope: AutoUserScope.Pool)), - ResourceFiles = new List { ResourceFile.FromUrl(startTaskSasUrl, StartTaskScriptFilename) } - }; - } - else - { - return default; - } + // Both start tasks are required. 
Note that dockerConfigCmdLine must be prefixed with an '(' which is closed inside of globalStartTaskCmdLine. + (true, true) => $"({dockerConfigCmdLine()} && {globalStartTaskCmdLine()}", + + // Only globalStartTaskCmdLine is required. Note that it contains an embedded ')' so the shell starting '(' must be provided. + (false, true) => $"({globalStartTaskCmdLine()}", + + // Only dockerConfigCmdLine is required. No additional subshell is needed. + (true, false) => dockerConfigCmdLine(), + + // No start task is needed. + _ => string.Empty, + }, + }; + + return string.IsNullOrWhiteSpace(startTask.CommandLine) ? default : startTask; } /// @@ -982,7 +978,7 @@ var s when s.StartsWith("batch.node.centos ") => "sudo yum install epel-release /// The to schedule on Azure Batch /// A for controlling the lifetime of the asynchronous operation. /// - private async ValueTask<(ContainerConfiguration ContainerConfiguration, (bool ExecutorImage, bool DockerInDockerImage, bool CromwellDrsImage) IsPublic)> GetContainerConfigurationIfNeededAsync(TesTask tesTask, CancellationToken cancellationToken) + private async ValueTask<(BatchModels.ContainerConfiguration ContainerConfiguration, (bool ExecutorImage, bool DockerInDockerImage, bool CromwellDrsImage) IsPublic)> GetContainerConfigurationIfNeededAsync(TesTask tesTask, CancellationToken cancellationToken) { var drsImageNeeded = tesTask.Inputs?.Any(i => i?.Url?.StartsWith("drs://") ?? false) ?? false; // TODO: Support for multiple executors. Cromwell has single executor per task. 
@@ -1049,7 +1045,7 @@ var s when s.StartsWith("batch.node.centos ") => "sudo yum install epel-release ContainerImageNames = result.ContainerImageNames, ContainerRegistries = result .ContainerRegistries - .Select(r => new ContainerRegistry( + .Select(r => new BatchModels.ContainerRegistry( userName: r.UserName, password: r.Password, registryServer: r.RegistryServer, @@ -1067,8 +1063,11 @@ private static BatchModels.BatchPoolIdentity GetBatchPoolIdentity(string[] ident => identities is null || !identities.Any() ? null : new(BatchModels.PoolIdentityType.UserAssigned, identities.ToDictionary(identity => identity, _ => new BatchModels.UserAssignedIdentities())); /// - /// Generate the PoolSpecification for the needed pool. + /// Generate the for the needed pool. /// + /// + /// + /// /// /// /// @@ -1077,18 +1076,21 @@ private static BatchModels.BatchPoolIdentity GetBatchPoolIdentity(string[] ident /// /// VM supports encryption at host. /// A for controlling the lifetime of the asynchronous operation. - /// - /// We use the PoolSpecification for both the namespace of all the constituent parts and for the fact that it allows us to configure shared and autopools using the same code. - private async ValueTask GetPoolSpecification(string vmSize, bool autoscaled, bool preemptable, int initialTarget, BatchNodeInfo nodeInfo, ContainerConfiguration containerConfiguration, bool encryptionAtHostSupported, CancellationToken cancellationToken) + /// The specification for the pool. + /// + /// Devs: Any changes to any properties set in this method will require corresponding changes to all classes implementing along with possibly any systems they call, with the likely exception of . 
+ /// + private async ValueTask GetPoolSpecification(string name, string displayName, BatchModels.BatchPoolIdentity poolIdentity, string vmSize, bool autoscaled, bool preemptable, int initialTarget, BatchNodeInfo nodeInfo, BatchModels.ContainerConfiguration containerConfiguration, bool encryptionAtHostSupported, CancellationToken cancellationToken) { - // Any changes to any properties set in this method will require corresponding changes to ConvertPoolSpecificationToModelsPool() - - var vmConfig = new VirtualMachineConfiguration( - imageReference: new ImageReference( - nodeInfo.BatchImageOffer, - nodeInfo.BatchImagePublisher, - nodeInfo.BatchImageSku, - nodeInfo.BatchImageVersion), + ValidateString(name, nameof(name), 64); + ValidateString(displayName, nameof(displayName), 1024); + + var vmConfig = new BatchModels.VirtualMachineConfiguration( + imageReference: new BatchModels.ImageReference( + publisher: nodeInfo.BatchImagePublisher, + offer: nodeInfo.BatchImageOffer, + sku: nodeInfo.BatchImageSku, + version: nodeInfo.BatchImageVersion), nodeAgentSkuId: nodeInfo.BatchNodeAgentSkuId) { ContainerConfiguration = containerConfiguration @@ -1096,154 +1098,52 @@ private async ValueTask GetPoolSpecification(string vmSize, b if (encryptionAtHostSupported) { - vmConfig.DiskEncryptionConfiguration = new DiskEncryptionConfiguration( - targets: new List { DiskEncryptionTarget.OsDisk, DiskEncryptionTarget.TemporaryDisk } + vmConfig.DiskEncryptionConfiguration = new BatchModels.DiskEncryptionConfiguration( + targets: new List { BatchModels.DiskEncryptionTarget.OsDisk, BatchModels.DiskEncryptionTarget.TemporaryDisk } ); } - var poolSpecification = new PoolSpecification - { - VirtualMachineConfiguration = vmConfig, - VirtualMachineSize = vmSize, - ResizeTimeout = TimeSpan.FromMinutes(30), - StartTask = await StartTaskIfNeeded(vmConfig, cancellationToken), - TargetNodeCommunicationMode = NodeCommunicationMode.Simplified, - }; + BatchModels.ScaleSettings scaleSettings = new(); if 
(autoscaled) { - poolSpecification.AutoScaleEnabled = true; - poolSpecification.AutoScaleEvaluationInterval = BatchPool.AutoScaleEvaluationInterval; - poolSpecification.AutoScaleFormula = BatchPool.AutoPoolFormula(preemptable, initialTarget); + scaleSettings.AutoScale = new(BatchPool.AutoPoolFormula(preemptable, initialTarget), BatchPool.AutoScaleEvaluationInterval); } else { - poolSpecification.AutoScaleEnabled = false; - poolSpecification.TargetLowPriorityComputeNodes = preemptable == true ? initialTarget : 0; - poolSpecification.TargetDedicatedComputeNodes = preemptable == false ? initialTarget : 0; + scaleSettings.FixedScale = new( + resizeTimeout: TimeSpan.FromMinutes(30), + targetDedicatedNodes: preemptable == false ? initialTarget : 0, + targetLowPriorityNodes: preemptable == true ? initialTarget : 0, + nodeDeallocationOption: BatchModels.ComputeNodeDeallocationOption.TaskCompletion); } + BatchModels.Pool poolSpec = new(name: name, displayName: displayName, identity: poolIdentity) + { + VmSize = vmSize, + ScaleSettings = scaleSettings, + DeploymentConfiguration = new(virtualMachineConfiguration: vmConfig), + //ApplicationPackages = , + StartTask = await StartTaskIfNeeded(vmConfig, cancellationToken), + TargetNodeCommunicationMode = BatchModels.NodeCommunicationMode.Simplified, + }; + if (!string.IsNullOrEmpty(batchNodesSubnetId)) { - poolSpecification.NetworkConfiguration = new() + poolSpec.NetworkConfiguration = new() { - PublicIPAddressConfiguration = new PublicIPAddressConfiguration(disableBatchNodesPublicIpAddress ? IPAddressProvisioningType.NoPublicIPAddresses : IPAddressProvisioningType.BatchManaged), + PublicIPAddressConfiguration = new BatchModels.PublicIPAddressConfiguration(disableBatchNodesPublicIpAddress ? 
BatchModels.IPAddressProvisioningType.NoPublicIPAddresses : BatchModels.IPAddressProvisioningType.BatchManaged), SubnetId = batchNodesSubnetId }; } - return poolSpecification; - } + return poolSpec; - /// - /// Convert PoolSpecification to Models.Pool, including any BatchPoolIdentity - /// - /// - /// Note: this is not a complete conversion. It only converts properties we are currently using (including referenced objects).
- /// Devs: Any changes to any properties set in this method will require corresponding changes to all classes implementing along with possibly any systems they call, with the possible exception of . - ///
- /// - /// - /// - /// - /// A . - private static BatchModels.Pool ConvertPoolSpecificationToModelsPool(string name, string displayName, BatchModels.BatchPoolIdentity poolIdentity, PoolSpecification pool) - { - // Don't add feature work here that isn't necesitated by a change to GetPoolSpecification() unless it's a feature that PoolSpecification does not support. - // TODO: (perpetually) add new properties we set in the future on and/or its contained objects, if possible. When not, update CreateAutoPoolModePoolInformation(). - - ValidateString(name, nameof(name), 64); - ValidateString(displayName, nameof(displayName), 1024); - - return new(name: name, displayName: displayName, identity: poolIdentity) - { - VmSize = pool.VirtualMachineSize, - ScaleSettings = true == pool.AutoScaleEnabled ? ConvertAutoScale(pool) : ConvertManualScale(pool), - DeploymentConfiguration = new(virtualMachineConfiguration: ConvertVirtualMachineConfiguration(pool.VirtualMachineConfiguration)), - ApplicationPackages = pool.ApplicationPackageReferences?.Select(ConvertApplicationPackage).ToList(), - NetworkConfiguration = ConvertNetworkConfiguration(pool.NetworkConfiguration), - StartTask = ConvertStartTask(pool.StartTask), - TargetNodeCommunicationMode = ConvertNodeCommunicationMode(pool.TargetNodeCommunicationMode), - }; - - static void ValidateString(string value, string name, int length) + static void ValidateString(string value, string paramName, int maxLength) { - ArgumentNullException.ThrowIfNull(value, name); - if (value.Length > length) throw new ArgumentException($"{name} exceeds maximum length {length}", name); + ArgumentNullException.ThrowIfNull(value, paramName); + if (value.Length > maxLength) throw new ArgumentException($"{paramName} exceeds maximum length {maxLength}", paramName); } - - static BatchModels.ScaleSettings ConvertManualScale(PoolSpecification pool) - => new() - { - FixedScale = new() - { - TargetDedicatedNodes = pool.TargetDedicatedComputeNodes, - 
TargetLowPriorityNodes = pool.TargetLowPriorityComputeNodes, - ResizeTimeout = pool.ResizeTimeout, - NodeDeallocationOption = BatchModels.ComputeNodeDeallocationOption.TaskCompletion - } - }; - - static BatchModels.ScaleSettings ConvertAutoScale(PoolSpecification pool) - => new() - { - AutoScale = new() - { - Formula = pool.AutoScaleFormula, - EvaluationInterval = pool.AutoScaleEvaluationInterval - } - }; - - static BatchModels.VirtualMachineConfiguration ConvertVirtualMachineConfiguration(VirtualMachineConfiguration virtualMachineConfiguration) - => virtualMachineConfiguration is null ? default : new(ConvertImageReference(virtualMachineConfiguration.ImageReference), virtualMachineConfiguration.NodeAgentSkuId, containerConfiguration: ConvertContainerConfiguration(virtualMachineConfiguration.ContainerConfiguration), diskEncryptionConfiguration: ConvertDiskEncryptionConfiguration(virtualMachineConfiguration.DiskEncryptionConfiguration)); - - static BatchModels.ContainerConfiguration ConvertContainerConfiguration(ContainerConfiguration containerConfiguration) - => containerConfiguration is null ? default : new(containerConfiguration.ContainerImageNames, containerConfiguration.ContainerRegistries?.Select(ConvertContainerRegistry).ToList()); - - static BatchModels.StartTask ConvertStartTask(StartTask startTask) - => startTask is null ? default : new(startTask.CommandLine, startTask.ResourceFiles?.Select(ConvertResourceFile).ToList(), startTask.EnvironmentSettings?.Select(ConvertEnvironmentSetting).ToList(), ConvertUserIdentity(startTask.UserIdentity), startTask.MaxTaskRetryCount, startTask.WaitForSuccess, ConvertTaskContainerSettings(startTask.ContainerSettings)); - - static BatchModels.UserIdentity ConvertUserIdentity(UserIdentity userIdentity) - => userIdentity is null ? 
default : new(userIdentity.UserName, ConvertAutoUserSpecification(userIdentity.AutoUser)); - - static BatchModels.AutoUserSpecification ConvertAutoUserSpecification(AutoUserSpecification autoUserSpecification) - => autoUserSpecification is null ? default : new((BatchModels.AutoUserScope?)autoUserSpecification.Scope, (BatchModels.ElevationLevel?)autoUserSpecification.ElevationLevel); - - static BatchModels.TaskContainerSettings ConvertTaskContainerSettings(TaskContainerSettings containerSettings) - => containerSettings is null ? default : new(containerSettings.ImageName, containerSettings.ContainerRunOptions, ConvertContainerRegistry(containerSettings.Registry), (BatchModels.ContainerWorkingDirectory?)containerSettings.WorkingDirectory); - - static BatchModels.ContainerRegistry ConvertContainerRegistry(ContainerRegistry containerRegistry) - => containerRegistry is null ? default : new(containerRegistry.UserName, containerRegistry.Password, containerRegistry.RegistryServer, ConvertComputeNodeIdentityReference(containerRegistry.IdentityReference)); - - static BatchModels.ResourceFile ConvertResourceFile(ResourceFile resourceFile) - => resourceFile is null ? default : new(resourceFile.AutoStorageContainerName, resourceFile.StorageContainerUrl, resourceFile.HttpUrl, resourceFile.BlobPrefix, resourceFile.FilePath, resourceFile.FileMode, ConvertComputeNodeIdentityReference(resourceFile.IdentityReference)); - - static BatchModels.ComputeNodeIdentityReference ConvertComputeNodeIdentityReference(ComputeNodeIdentityReference computeNodeIdentityReference) - => computeNodeIdentityReference is null ? default : new(computeNodeIdentityReference.ResourceId); - - static BatchModels.EnvironmentSetting ConvertEnvironmentSetting(EnvironmentSetting environmentSetting) - => environmentSetting is null ? default : new(environmentSetting.Name, environmentSetting.Value); - - static BatchModels.ImageReference ConvertImageReference(ImageReference imageReference) - => imageReference is null ? 
default : new(imageReference.Publisher, imageReference.Offer, imageReference.Sku, imageReference.Version); - - static BatchModels.ApplicationPackageReference ConvertApplicationPackage(ApplicationPackageReference applicationPackage) - => applicationPackage is null ? default : new(applicationPackage.ApplicationId, applicationPackage.Version); - - static BatchModels.NetworkConfiguration ConvertNetworkConfiguration(NetworkConfiguration networkConfiguration) - => networkConfiguration is null ? default : new(subnetId: networkConfiguration.SubnetId, publicIPAddressConfiguration: ConvertPublicIPAddressConfiguration(networkConfiguration.PublicIPAddressConfiguration)); - - static BatchModels.PublicIPAddressConfiguration ConvertPublicIPAddressConfiguration(PublicIPAddressConfiguration publicIPAddressConfiguration) - => publicIPAddressConfiguration is null ? default : new(provision: (BatchModels.IPAddressProvisioningType?)publicIPAddressConfiguration.Provision); - - static BatchModels.NodeCommunicationMode? ConvertNodeCommunicationMode(NodeCommunicationMode? nodeCommunicationMode) - => (BatchModels.NodeCommunicationMode?)nodeCommunicationMode; - - static BatchModels.DiskEncryptionConfiguration ConvertDiskEncryptionConfiguration(DiskEncryptionConfiguration diskEncryptionConfiguration) - => diskEncryptionConfiguration is null ? default : new(diskEncryptionConfiguration.Targets.Select(x => ConvertDiskEncryptionTarget(x)).ToList()); - - static BatchModels.DiskEncryptionTarget ConvertDiskEncryptionTarget(DiskEncryptionTarget? 
diskEncryptionTarget) - => (BatchModels.DiskEncryptionTarget)diskEncryptionTarget; } /// diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index 861263a9c..56e4c57f4 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -66,38 +66,35 @@ public CachingWithRetriesAzureProxy(IAzureProxy azureProxy, IOptions - public async Task CreateBatchJobAsync(PoolInformation poolInformation, CancellationToken cancellationToken) + public async Task CreateBatchJobAsync(string jobId, CancellationToken cancellationToken) { try { - await batchPoolOrJobCreateOrTaskAddHandler.ExecuteAsync(ct => azureProxy.CreateBatchJobAsync(poolInformation, ct), cancellationToken); + await batchPoolOrJobCreateOrTaskAddHandler.ExecuteAsync(ct => azureProxy.CreateBatchJobAsync(jobId, ct), cancellationToken); } catch (BatchException exc) when (BatchErrorCodeStrings.JobExists.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { } } /// - public async Task AddBatchTaskAsync(string tesTaskId, CloudTask cloudTask, PoolInformation poolInformation, CancellationToken cancellationToken) + public async Task AddBatchTaskAsync(string tesTaskId, CloudTask cloudTask, string jobId, CancellationToken cancellationToken) { try { - await batchPoolOrJobCreateOrTaskAddHandler.ExecuteAsync(ct => azureProxy.AddBatchTaskAsync(tesTaskId, cloudTask, poolInformation, ct), cancellationToken); + await batchPoolOrJobCreateOrTaskAddHandler.ExecuteAsync(ct => azureProxy.AddBatchTaskAsync(tesTaskId, cloudTask, jobId, ct), cancellationToken); } catch (BatchException exc) when (BatchErrorCodeStrings.TaskExists.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { } } /// - public Task DeleteBatchJobAsync(PoolInformation poolInformation, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => 
azureProxy.DeleteBatchJobAsync(poolInformation, ct), cancellationToken); + public Task DeleteBatchJobAsync(string jobId, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchJobAsync(jobId, ct), cancellationToken); /// - public Task DeleteBatchJobAsync(string taskId, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchJobAsync(taskId, ct), cancellationToken); + public Task DeleteBatchTaskAsync(string tesTaskId, string jobId, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchTaskAsync(tesTaskId, jobId, ct), cancellationToken); /// - public Task DeleteBatchTaskAsync(string taskId, string jobId, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchTaskAsync(taskId, jobId, ct), cancellationToken); - - /// - public Task TerminateBatchTaskAsync(string taskId, string jobId, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.TerminateBatchTaskAsync(taskId, jobId, ct), cancellationToken); + public Task TerminateBatchTaskAsync(string tesTaskId, string jobId, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.TerminateBatchTaskAsync(tesTaskId, jobId, ct), cancellationToken); /// public Task DeleteBatchPoolAsync(string poolId, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchPoolAsync(poolId, ct), cancellationToken); @@ -121,9 +118,6 @@ public async Task DeleteBatchComputeNodesAsync(string poolId, IEnumerable public Task BlobExistsAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.BlobExistsAsync(blobAbsoluteUri, ct), cancellationToken); - /// - public Task> GetActivePoolIdsAsync(string prefix, TimeSpan minAge, 
CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetActivePoolIdsAsync(prefix, minAge, ct), cancellationToken); - /// public IAsyncEnumerable GetActivePoolsAsync(string hostName) => cachingRetryHandler.ExecuteWithRetry(() => azureProxy.GetActivePoolsAsync(hostName)); @@ -137,9 +131,6 @@ public async Task DeleteBatchComputeNodesAsync(string poolId, IEnumerable public int GetBatchActivePoolCount() => cachingRetryHandler.ExecuteWithRetry(() => azureProxy.GetBatchActivePoolCount()); - /// - public Task> GetPoolIdsReferencedByJobsAsync(CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(azureProxy.GetPoolIdsReferencedByJobsAsync, cancellationToken); - /// public Task GetStorageAccountKeyAsync(StorageAccountInfo storageAccountInfo, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAndCachingAsync($"{nameof(CachingWithRetriesAzureProxy)}:{storageAccountInfo.Id}", @@ -166,12 +157,6 @@ public async Task GetStorageAccountInfoAsync(string storageA /// public Task> ListBlobsAsync(Uri directoryUri, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.ListBlobsAsync(directoryUri, ct), cancellationToken); - /// - public Task> ListOldJobsToDeleteAsync(TimeSpan oldestJobAge, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.ListOldJobsToDeleteAsync(oldestJobAge, ct), cancellationToken); - - /// - public Task> ListOrphanedJobsToDeleteAsync(TimeSpan minJobAge, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.ListOrphanedJobsToDeleteAsync(minJobAge, ct), cancellationToken); - /// public Task UploadBlobAsync(Uri blobAbsoluteUri, string content, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.UploadBlobAsync(blobAbsoluteUri, content, ct), cancellationToken); @@ -191,7 +176,7 @@ public async 
Task GetStorageAccountInfoAsync(string storageA public string GetArmRegion() => azureProxy.GetArmRegion(); /// - public async Task CreateBatchPoolAsync(BatchModels.Pool poolInfo, bool isPreemptable, CancellationToken cancellationToken) + public async Task CreateBatchPoolAsync(BatchModels.Pool poolInfo, bool isPreemptable, CancellationToken cancellationToken) { try { @@ -199,7 +184,7 @@ public async Task CreateBatchPoolAsync(BatchModels.Pool poolInf } catch (BatchException exc) when (BatchErrorCodeStrings.PoolExists.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { - return new() { PoolId = poolInfo.Name }; + return await GetBatchPoolAsync(poolInfo.Name, cancellationToken, new ODATADetailLevel { SelectClause = BatchPool.CloudPoolSelectClause }); } } diff --git a/src/TesApi.Web/IAzureProxy.cs b/src/TesApi.Web/IAzureProxy.cs index 1b85636a3..137c14dd7 100644 --- a/src/TesApi.Web/IAzureProxy.cs +++ b/src/TesApi.Web/IAzureProxy.cs @@ -20,25 +20,25 @@ public interface IAzureProxy /// /// Creates a new Azure Batch job for /// - /// + /// /// A for controlling the lifetime of the asynchronous operation. - Task CreateBatchJobAsync(PoolInformation poolInformation, CancellationToken cancellationToken); + Task CreateBatchJobAsync(string jobId, CancellationToken cancellationToken); /// - /// Adds a task to the batch job paired to the ."/> + /// Adds a to the job."/> /// /// /// - /// + /// /// A for controlling the lifetime of the asynchronous operation. - Task AddBatchTaskAsync(string tesTaskId, CloudTask cloudTask, PoolInformation poolInformation, CancellationToken cancellationToken); + Task AddBatchTaskAsync(string tesTaskId, CloudTask cloudTask, string jobId, CancellationToken cancellationToken); /// /// Terminates and deletes an Azure Batch job for /// - /// + /// /// A for controlling the lifetime of the asynchronous operation. 
- Task DeleteBatchJobAsync(PoolInformation poolInformation, CancellationToken cancellationToken); + Task DeleteBatchJobAsync(string jobId, CancellationToken cancellationToken); /// /// Gets the for the given storage account name @@ -51,33 +51,26 @@ public interface IAzureProxy /// /// Creates an Azure Batch pool who's lifecycle must be manually managed /// - /// Contains information about the pool. becomes the + /// Contains information about the pool to be created. Note that becomes . /// True if nodes in this pool will all be preemptable. False if nodes will all be dedicated. /// A for controlling the lifetime of the asynchronous operation. - Task CreateBatchPoolAsync(BatchModels.Pool poolInfo, bool isPreemptable, CancellationToken cancellationToken); - - /// - /// Deletes an Azure Batch job for Autopools - /// - /// The unique TES task ID - /// A for controlling the lifetime of the asynchronous operation. - Task DeleteBatchJobAsync(string taskId, CancellationToken cancellationToken); + Task CreateBatchPoolAsync(BatchModels.Pool poolInfo, bool isPreemptable, CancellationToken cancellationToken); /// /// Terminates an Azure Batch task /// - /// The unique TES task ID + /// The unique TES task ID /// The batch job that contains the task /// A for controlling the lifetime of the asynchronous operation. - Task TerminateBatchTaskAsync(string taskId, string jobId, CancellationToken cancellationToken); + Task TerminateBatchTaskAsync(string tesTaskId, string jobId, CancellationToken cancellationToken); /// /// Deletes an Azure Batch task /// - /// The unique TES task ID + /// The unique TES task ID /// The batch job that contains the task /// A for controlling the lifetime of the asynchronous operation. 
- Task DeleteBatchTaskAsync(string taskId, string jobId, CancellationToken cancellationToken); + Task DeleteBatchTaskAsync(string tesTaskId, string jobId, CancellationToken cancellationToken); /// /// Gets the counts of active batch nodes, grouped by VmSize @@ -155,32 +148,6 @@ public interface IAzureProxy /// Task GetBlobPropertiesAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken); - /// - /// Gets the ids of completed Batch jobs older than specified timespan - /// - /// - /// A for controlling the lifetime of the asynchronous operation. - /// List of Batch job ids - Task> ListOldJobsToDeleteAsync(TimeSpan oldestJobAge, CancellationToken cancellationToken); - - /// - /// Gets the ids of orphaned Batch jobs older than specified timespan - /// These jobs are active for prolonged period of time, have auto pool, NoAction termination option, and no tasks - /// - /// - /// A for controlling the lifetime of the asynchronous operation. - /// List of Batch job ids - Task> ListOrphanedJobsToDeleteAsync(TimeSpan minJobAge, CancellationToken cancellationToken); - - /// - /// Gets the list of active pool ids matching the prefix and with creation time older than the minAge - /// - /// - /// - /// A for controlling the lifetime of the asynchronous operation. - /// Active pool ids - Task> GetActivePoolIdsAsync(string prefix, TimeSpan minAge, CancellationToken cancellationToken); - /// /// Gets the list of active pools matching the hostname in the metadata /// @@ -188,13 +155,6 @@ public interface IAzureProxy /// List of managed by the host. IAsyncEnumerable GetActivePoolsAsync(string hostName); - /// - /// Gets the list of pool ids referenced by the jobs - /// - /// A for controlling the lifetime of the asynchronous operation. - /// Pool ids - Task> GetPoolIdsReferencedByJobsAsync(CancellationToken cancellationToken); - /// /// Deletes the specified pool /// @@ -207,7 +167,7 @@ public interface IAzureProxy /// /// The of the pool to retrieve. 
/// A for controlling the lifetime of the asynchronous operation. - /// A Microsoft.Azure.Batch.DetailLevel used for controlling which properties are retrieved from the service. + /// A used for controlling which properties are retrieved from the service. /// Task GetBatchPoolAsync(string poolId, CancellationToken cancellationToken, DetailLevel detailLevel = default); @@ -216,7 +176,7 @@ public interface IAzureProxy /// /// The of the job to retrieve. /// A for controlling the lifetime of the asynchronous operation. - /// A Microsoft.Azure.Batch.DetailLevel used for controlling which properties are retrieved from the service. + /// A used for controlling which properties are retrieved from the service. /// Task GetBatchJobAsync(string jobId, CancellationToken cancellationToken, DetailLevel detailLevel = default); @@ -224,7 +184,7 @@ public interface IAzureProxy /// Lists compute nodes in batch pool /// /// The id of the pool. - /// A Microsoft.Azure.Batch.DetailLevel used for filtering the list and for controlling which properties are retrieved from the service. + /// A used for filtering the list and for controlling which properties are retrieved from the service. /// IAsyncEnumerable ListComputeNodesAsync(string poolId, DetailLevel detailLevel = null); @@ -232,7 +192,7 @@ public interface IAzureProxy /// Lists jobs in the batch account ///
/// The job id (which is the pool id) - /// A Microsoft.Azure.Batch.DetailLevel used for filtering the list and for controlling which properties are retrieved from the service. + /// A used for filtering the list and for controlling which properties are retrieved from the service. /// IAsyncEnumerable ListTasksAsync(string jobId, DetailLevel detailLevel = null); diff --git a/src/TesApi.Web/IBatchPool.cs b/src/TesApi.Web/IBatchPool.cs index f65db2545..f74536eea 100644 --- a/src/TesApi.Web/IBatchPool.cs +++ b/src/TesApi.Web/IBatchPool.cs @@ -20,9 +20,9 @@ public interface IBatchPool bool IsAvailable { get; } /// - /// Provides the for the pool. + /// Provides the for the pool. /// - PoolInformation Pool { get; } + string Id { get; } /// /// Creates an Azure Batch pool and associated job in the Batch Account. diff --git a/src/TesApi.Web/Management/Batch/ArmBatchPoolManager.cs b/src/TesApi.Web/Management/Batch/ArmBatchPoolManager.cs index 15fe452ff..52247f625 100644 --- a/src/TesApi.Web/Management/Batch/ArmBatchPoolManager.cs +++ b/src/TesApi.Web/Management/Batch/ArmBatchPoolManager.cs @@ -37,7 +37,7 @@ public ArmBatchPoolManager(AzureManagementClientsFactory azureClientsFactory, } /// - public async Task CreateBatchPoolAsync(Pool poolInfo, bool isPreemptable, CancellationToken cancellationToken) + public async Task CreateBatchPoolAsync(Pool poolInfo, bool isPreemptable, CancellationToken cancellationToken) { try { @@ -45,7 +45,7 @@ public async Task CreateBatchPoolAsync(Pool poolInfo, bool isPr var pool = await batchManagementClient.Pool.CreateAsync(azureClientsFactory.BatchAccountInformation.ResourceGroupName, azureClientsFactory.BatchAccountInformation.Name, poolInfo.Name, poolInfo, cancellationToken: cancellationToken); - return new PoolInformation() { PoolId = pool.Name }; + return pool.Name; } catch (Exception exc) { diff --git a/src/TesApi.Web/Management/Batch/IBatchPoolManager.cs b/src/TesApi.Web/Management/Batch/IBatchPoolManager.cs index 3389965c7..a5de454c3 
100644 --- a/src/TesApi.Web/Management/Batch/IBatchPoolManager.cs +++ b/src/TesApi.Web/Management/Batch/IBatchPoolManager.cs @@ -18,9 +18,9 @@ public interface IBatchPoolManager /// /// Contains information about the pool. becomes the /// True if nodes in this pool will all be preemptable. False if nodes will all be dedicated. - /// that identifies the created pool. + /// that identifies the created pool. /// - Task CreateBatchPoolAsync(BatchModels.Pool poolInfo, bool isPreemptable, CancellationToken cancellationToken); + Task CreateBatchPoolAsync(BatchModels.Pool poolInfo, bool isPreemptable, CancellationToken cancellationToken); /// /// Deletes the specified pool diff --git a/src/TesApi.Web/Management/Batch/TerraBatchPoolManager.cs b/src/TesApi.Web/Management/Batch/TerraBatchPoolManager.cs index d6f9c1a70..c060029d6 100644 --- a/src/TesApi.Web/Management/Batch/TerraBatchPoolManager.cs +++ b/src/TesApi.Web/Management/Batch/TerraBatchPoolManager.cs @@ -70,7 +70,7 @@ public TerraBatchPoolManager(TerraWsmApiClient terraWsmApiClient, IMapper mapper /// /// A for controlling the lifetime of the asynchronous operation. 
/// - public async Task CreateBatchPoolAsync(Pool poolInfo, bool isPreemptable, CancellationToken cancellationToken) + public async Task CreateBatchPoolAsync(Pool poolInfo, bool isPreemptable, CancellationToken cancellationToken) { var resourceId = Guid.NewGuid(); var resourceName = $"TES-{resourceId}"; @@ -95,7 +95,7 @@ public async Task CreateBatchPoolAsync(Pool poolInfo, bool isPr var response = await terraWsmApiClient.CreateBatchPool(Guid.Parse(terraOptions.WorkspaceId), apiRequest, cancellationToken); - return new PoolInformation() { PoolId = response.AzureBatchPool.Attributes.Id }; + return response.AzureBatchPool.Attributes.Id; } private static void AddResourceIdToPoolMetadata(ApiCreateBatchPoolRequest apiRequest, Guid resourceId) From 9ad19165bcc33d9cabc1a2759284eab49dccc7f2 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 6 Oct 2023 10:44:14 -0700 Subject: [PATCH 025/202] Formatting --- src/TesApi.Tests/BatchPoolTests.cs | 18 ++++++++++-------- src/TesApi.Web/BatchPool.cs | 8 ++++++-- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/TesApi.Tests/BatchPoolTests.cs b/src/TesApi.Tests/BatchPoolTests.cs index 961023b87..ec02cfb02 100644 --- a/src/TesApi.Tests/BatchPoolTests.cs +++ b/src/TesApi.Tests/BatchPoolTests.cs @@ -252,7 +252,7 @@ static Microsoft.Azure.Batch.MetadataItem ConvertMetadata(KeyValuePair GeneratePool( + { + return GeneratePool( id: poolId, currentDedicatedNodes: poolState.CurrentDedicatedNodes, currentLowPriorityNodes: poolState.CurrentLowPriorityNodes, @@ -321,6 +322,7 @@ private static CloudPool GetPoolFromState(string poolId, PoolState poolState) autoScaleRun: poolState.AutoScaleRun, enableAutoScale: poolState.EnableAutoScale, metadata: poolState.PoolMetadata); + } } private static Action> GetMockSkuInfoProvider(AzureProxyReturnValues azureProxyReturnValues) diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index a5f1e1136..f55839117 100644 --- a/src/TesApi.Web/BatchPool.cs +++ 
b/src/TesApi.Web/BatchPool.cs @@ -130,11 +130,15 @@ private async ValueTask ServicePoolGetResizeErrorsAsync(CancellationToken cancel if (pool.AutoScaleRun?.Error is not null || (autoScaleRunCutoff > Creation && pool.AutoScaleRun?.Timestamp < autoScaleRunCutoff)) { _resetAutoScalingRequired |= true; - _logger.LogDebug("Resetting AutoScale for pool {PoolId} because AutoScaleRun error '{AutoScaleRunError}' or timestamp '{AutoScaleRunTimestamp}' is older than {AutoScaleRunCutoff}.", Id, pool.AutoScaleRun?.Error?.Code ?? "n/a", pool.AutoScaleRun?.Timestamp.ToUniversalTime().ToString("O") ?? "n/a", autoScaleRunCutoff.ToUniversalTime().ToString("O")); + _logger.LogDebug("Resetting AutoScale for pool {PoolId} because AutoScaleRun error '{AutoScaleRunError}' or timestamp '{AutoScaleRunTimestamp}' is older than {AutoScaleRunCutoff}.", + Id, + pool.AutoScaleRun?.Error?.Code ?? "n/a", + pool.AutoScaleRun?.Timestamp.ToUniversalTime().ToString("O") ?? "n/a", + autoScaleRunCutoff.ToUniversalTime().ToString("O")); if (pool.AutoScaleRun?.Error is not null) { - _logger.LogDebug("AutoScale({PoolId}) Error '{AutoScaleRunErrorMessage}': Details: {AutoScaleRunErrorValues} .", Id, pool.AutoScaleRun?.Error?.Message?? "n/a", + _logger.LogDebug("AutoScale({PoolId}) Error '{AutoScaleRunErrorMessage}': Details: {AutoScaleRunErrorValues} .", Id, pool.AutoScaleRun?.Error?.Message ?? "n/a", string.Join(", ", (pool.AutoScaleRun?.Error?.Values ?? 
Enumerable.Empty()).Select(pair => $"'{pair.Name}': '{pair.Value}'"))); } } From 4689af7eb5122436775d6651e24b27e2d9ec5c78 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 6 Oct 2023 20:28:43 -0700 Subject: [PATCH 026/202] Add (untested) processing of event messages --- .../TestServices/TestServiceProvider.cs | 3 + src/TesApi.Web/AzureProxy.cs | 33 ++++- src/TesApi.Web/BatchPools.BatchScheduler.cs | 2 +- src/TesApi.Web/BatchScheduler.cs | 49 ++++++- .../CachingWithRetriesAzureProxy.cs | 8 +- src/TesApi.Web/IAzureProxy.cs | 18 +++ src/TesApi.Web/IBatchScheduler.cs | 12 +- src/TesApi.Web/Scheduler.cs | 90 ++++++++++++- src/TesApi.Web/Startup.cs | 2 + src/TesApi.Web/TesApi.Web.csproj | 3 +- src/TesApi.Web/TesEventMessage.cs | 120 ++++++++++++++++++ 11 files changed, 321 insertions(+), 19 deletions(-) create mode 100644 src/TesApi.Web/TesEventMessage.cs diff --git a/src/TesApi.Tests/TestServices/TestServiceProvider.cs b/src/TesApi.Tests/TestServices/TestServiceProvider.cs index fb4f5efaf..754ca2036 100644 --- a/src/TesApi.Tests/TestServices/TestServiceProvider.cs +++ b/src/TesApi.Tests/TestServices/TestServiceProvider.cs @@ -83,11 +83,14 @@ internal TestServiceProvider( .AddTransient>(_ => NullLogger.Instance) .AddTransient>(_ => NullLogger.Instance) .AddTransient>(_ => NullLogger.Instance) + .AddTransient>(_ => NullLogger.Instance) .AddSingleton() .AddSingleton() .AddSingleton() .AddSingleton() .AddTransient() + .AddSingleton() + .AddTransient() .AddSingleton() .AddSingleton(s => GetArmBatchQuotaProvider(s, armBatchQuotaProvider)) //added so config utils gets the arm implementation, to be removed once config utils is refactored. 
.AddSingleton() diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index dcc343bbc..eecee4814 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -8,6 +8,7 @@ using System.Text.RegularExpressions; using System.Threading; using System.Threading.Tasks; +using Azure.Storage.Blobs; using Microsoft.Azure.Batch; using Microsoft.Azure.Batch.Auth; using Microsoft.Azure.Batch.Common; @@ -62,7 +63,7 @@ public partial class AzureProxy : IAzureProxy /// /// The logger /// - public AzureProxy(IOptions batchAccountOptions, IBatchPoolManager batchPoolManager, ILogger logger) + public AzureProxy(IOptions batchAccountOptions, IBatchPoolManager batchPoolManager, ILogger logger/*, Azure.Core.TokenCredential tokenCredential*/) { ArgumentNullException.ThrowIfNull(batchAccountOptions); ArgumentNullException.ThrowIfNull(logger); @@ -409,6 +410,36 @@ public async Task> ListBlobsAsync(Uri directoryUri, Cance return results; } + /// + public IAsyncEnumerable ListBlobsWithTagsAsync(Uri directoryUri, IDictionary tagsQuery, CancellationToken cancellationToken) + { + BlobUriBuilder builder = new(directoryUri); + var directory = builder.BlobName; + builder.BlobName = string.Empty; + BlobContainerClient container = new(builder.ToUri()); + + if (!directory.EndsWith('/')) + { + directory += "/"; + } + + return container.FindBlobsByTagsAsync($"&where=@container='{container.Name}' AND {string.Join(" AND", tagsQuery.Select(pair => $"\"{pair.Key}\"='{pair.Value}'"))}", cancellationToken) + .Where(blob => blob.BlobName.StartsWith(directory)); + } + + + /// + public async Task SetBlobTags(Uri blobAbsoluteUri, IDictionary tags, CancellationToken cancellationToken) + { + BlobClient blob = new(blobAbsoluteUri); + using var result = await blob.SetTagsAsync(tags, cancellationToken: cancellationToken); + + if (result.IsError) + { + // throw something here. 
+ } + } + /// public bool LocalFileExists(string path) => File.Exists(path); diff --git a/src/TesApi.Web/BatchPools.BatchScheduler.cs b/src/TesApi.Web/BatchPools.BatchScheduler.cs index 8dc1c1e70..3cdf55b75 100644 --- a/src/TesApi.Web/BatchPools.BatchScheduler.cs +++ b/src/TesApi.Web/BatchPools.BatchScheduler.cs @@ -153,7 +153,7 @@ internal async Task GetOrAddPoolAsync(string key, bool isPreemptable modelPool.Metadata ??= new List(); modelPool.Metadata.Add(new(PoolHostName, this.batchPrefix)); modelPool.Metadata.Add(new(PoolIsDedicated, (!isPreemptable).ToString())); - var batchPool = _batchPoolFactory.CreateNew(); + var batchPool = batchPoolFactory.CreateNew(); await batchPool.CreatePoolAndJobAsync(modelPool, isPreemptable, cancellationToken); pool = batchPool; } diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 3a0f610a5..00a6559e7 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -19,7 +19,6 @@ using TesApi.Web.Extensions; using TesApi.Web.Management; using TesApi.Web.Management.Models.Quotas; -using TesApi.Web.Options; using TesApi.Web.Runner; using TesApi.Web.Storage; using BatchModels = Microsoft.Azure.Management.Batch.Models; @@ -74,7 +73,8 @@ public partial class BatchScheduler : IBatchScheduler private readonly string globalManagedIdentity; private readonly ContainerRegistryProvider containerRegistryProvider; private readonly string batchPrefix; - private readonly IBatchPoolFactory _batchPoolFactory; + private readonly IBatchPoolFactory batchPoolFactory; + private readonly BatchTesEventMessageFactory batchTesEventMessageFactory; private readonly IAllowedVmSizesService allowedVmSizesService; private readonly TaskExecutionScriptingManager taskExecutionScriptingManager; @@ -96,7 +96,8 @@ public partial class BatchScheduler : IBatchScheduler /// Container registry information /// Batch pool factory /// Service to get allowed vm sizes. 
- /// + /// + /// /param> public BatchScheduler( ILogger logger, IOptions batchGen1Options, @@ -113,7 +114,8 @@ public BatchScheduler( ContainerRegistryProvider containerRegistryProvider, IBatchPoolFactory poolFactory, IAllowedVmSizesService allowedVmSizesService, - TaskExecutionScriptingManager taskExecutionScriptingManager) + TaskExecutionScriptingManager taskExecutionScriptingManager, + BatchTesEventMessageFactory batchTesEventMessageFactory) { ArgumentNullException.ThrowIfNull(logger); ArgumentNullException.ThrowIfNull(azureProxy); @@ -123,6 +125,7 @@ public BatchScheduler( ArgumentNullException.ThrowIfNull(containerRegistryProvider); ArgumentNullException.ThrowIfNull(poolFactory); ArgumentNullException.ThrowIfNull(taskExecutionScriptingManager); + ArgumentNullException.ThrowIfNull(batchTesEventMessageFactory); this.logger = logger; this.azureProxy = azureProxy; @@ -144,8 +147,9 @@ public BatchScheduler( this.globalManagedIdentity = batchNodesOptions.Value.GlobalManagedIdentity; this.allowedVmSizesService = allowedVmSizesService; this.taskExecutionScriptingManager = taskExecutionScriptingManager; + this.batchTesEventMessageFactory = batchTesEventMessageFactory; - _batchPoolFactory = poolFactory; + batchPoolFactory = poolFactory; batchPrefix = batchSchedulingOptions.Value.Prefix; logger.LogInformation("BatchPrefix: {BatchPrefix}", batchPrefix); File.ReadAllLines(Path.Combine(AppContext.BaseDirectory, "scripts/task-run.sh")); @@ -345,7 +349,7 @@ public async Task LoadExistingPoolsAsync(CancellationToken cancellationToken) { try { - var batchPool = _batchPoolFactory.CreateNew(); + var batchPool = batchPoolFactory.CreateNew(); await batchPool.AssignPoolAsync(cloudPool, cancellationToken); } catch (Exception exc) @@ -1372,6 +1376,39 @@ private static Dictionary DelimitedTextToDictionary(string text, .Select(line => { var parts = line.Split(fieldDelimiter); return new KeyValuePair(parts[0], parts[1]); }) .ToDictionary(kv => kv.Key, kv => kv.Value); + /// + public 
async IAsyncEnumerable GetEventMessages([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken, string @event = null) + { + var path = "events"; + + if (!string.IsNullOrWhiteSpace(@event)) + { + path += "/" + @event; + } + + Uri directoryUri = new(await storageAccessProvider.GetInternalTesBlobUrlAsync(path, cancellationToken)); + var accountSegments = StorageAccountUrlSegments.Create(directoryUri.ToString()); + + await foreach (var blobItem in azureProxy.ListBlobsWithTagsAsync(directoryUri, new Dictionary() { { TesEventMessage.ProcessedTag, string.Empty } }, cancellationToken) + .WithCancellation(cancellationToken)) + { + if (blobItem.Tags.ContainsKey(TesEventMessage.ProcessedTag)) + { + continue; + } + + UriBuilder builder = new(directoryUri) + { + Path = $"{accountSegments.ContainerName}/{blobItem.BlobName}" + }; + + var pathUnderDirectory = builder.Path[accountSegments.BlobName.Length..]; + var eventName = pathUnderDirectory[..pathUnderDirectory.IndexOf('/')]; + + yield return batchTesEventMessageFactory.CreateNew(builder.Uri, blobItem.Tags, eventName); + } + } + /// /// Class that captures how transitions from current state to the new state, given the current Batch task state and optional condition. /// Transitions typically include an action that needs to run in order for the task to move to the new state. 
diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index 56e4c57f4..36ff2247b 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -15,7 +15,6 @@ using Polly.Retry; using Tes.ApiClients; using Tes.ApiClients.Options; -using TesApi.Web.Management.Configuration; using TesApi.Web.Storage; using BatchModels = Microsoft.Azure.Management.Batch.Models; @@ -209,5 +208,12 @@ public async Task CreateBatchPoolAsync(BatchModels.Pool poolInfo, boo /// public Task EvaluateAutoScaleAsync(string poolId, string autoscaleFormula, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.EvaluateAutoScaleAsync(poolId, autoscaleFormula, ct), cancellationToken); + + /// + public IAsyncEnumerable ListBlobsWithTagsAsync(Uri directoryUri, IDictionary tagsQuery, CancellationToken cancellationToken) + => cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListBlobsWithTagsAsync(directoryUri, tagsQuery, cancellationToken), cachingRetryHandler.RetryPolicy); + + /// + public Task SetBlobTags(Uri blobAbsoluteUri, IDictionary tags, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.SetBlobTags(blobAbsoluteUri, tags, ct), cancellationToken); } } diff --git a/src/TesApi.Web/IAzureProxy.cs b/src/TesApi.Web/IAzureProxy.cs index 137c14dd7..58ff82716 100644 --- a/src/TesApi.Web/IAzureProxy.cs +++ b/src/TesApi.Web/IAzureProxy.cs @@ -148,6 +148,24 @@ public interface IAzureProxy /// Task GetBlobPropertiesAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken); + /// + /// List blobs whose tags match a given search expression in the given directory. + /// + /// Directory Uri + /// Tags and values to exactly match (case sensitive). + /// A for controlling the lifetime of the asynchronous operation. 
+ /// + IAsyncEnumerable ListBlobsWithTagsAsync(Uri directoryUri, IDictionary tagsQuery, CancellationToken cancellationToken); + + /// + /// Sets tags on the underlying blob. + /// + /// Absolute Blob URI + /// The tags to set on the blob. + /// A for controlling the lifetime of the asynchronous operation. + /// + Task SetBlobTags(Uri blobAbsoluteUri, IDictionary tags, CancellationToken cancellationToken); + /// /// Gets the list of active pools matching the hostname in the metadata /// diff --git a/src/TesApi.Web/IBatchScheduler.cs b/src/TesApi.Web/IBatchScheduler.cs index 1d0ecc690..d27e5867e 100644 --- a/src/TesApi.Web/IBatchScheduler.cs +++ b/src/TesApi.Web/IBatchScheduler.cs @@ -69,7 +69,7 @@ public interface IBatchScheduler /// Deletes pool and job. /// /// - /// + /// A for controlling the lifetime of the asynchronous operation. /// Task DeletePoolAsync(IBatchPool pool, CancellationToken cancellationToken); @@ -91,7 +91,7 @@ public interface IBatchScheduler /// Flushes empty pools to accomodate pool quota limits. /// /// Pool Ids of pools connected to active TES Tasks. Used to prevent accidentally removing active pools. - /// + /// A for controlling the lifetime of the asynchronous operation. /// ValueTask FlushPoolsAsync(IEnumerable assignedPools, CancellationToken cancellationToken); @@ -101,5 +101,13 @@ public interface IBatchScheduler /// . /// . string GetTesTaskIdFromCloudTaskId(string cloudTaskId); + + /// + /// Gets unprocessed events from the storage account. + /// + /// A for controlling the lifetime of the asynchronous operation. + /// Optional event to retrieve. + /// + IAsyncEnumerable GetEventMessages(CancellationToken cancellationToken, string @event = default); } } diff --git a/src/TesApi.Web/Scheduler.cs b/src/TesApi.Web/Scheduler.cs index 1e8e74adf..c8919c8b1 100644 --- a/src/TesApi.Web/Scheduler.cs +++ b/src/TesApi.Web/Scheduler.cs @@ -2,6 +2,7 @@ // Licensed under the MIT License. 
using System; +using System.Collections.Concurrent; using System.Collections.Generic; using System.Linq; using System.Threading; @@ -153,18 +154,93 @@ private Task ExecuteUpdateTesTaskFromEventBlobAsync(CancellationToken stoppingTo ///
/// Triggered when Microsoft.Extensions.Hosting.IHostedService.StopAsync(System.Threading.CancellationToken) is called. /// - ValueTask UpdateTesTasksFromEventBlobsAsync(CancellationToken stoppingToken) + async ValueTask UpdateTesTasksFromEventBlobsAsync(CancellationToken stoppingToken) { - return ValueTask.CompletedTask; + var messageInfos = new List(); + var messages = new ConcurrentBag(); - //// Get and parse event blobs + // Get and parse event blobs + await foreach (var message in batchScheduler.GetEventMessages(stoppingToken).WithCancellation(stoppingToken)) + { + messageInfos.Add(message); + } - //// Get TesTask for each blob + try + { + await Parallel.ForEachAsync(messageInfos, ProcessMessage); + } + catch { } // TODO: identify exceptions + + // Update TesTasks + await OrchestrateTesTasksOnBatchAsync( + "NodeEvent", +#pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously + async token => GetTesTasks(token), +#pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously + (tesTasks, token) => batchScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, messages.Select(GetCompletedBatchState).ToArray(), token), + stoppingToken); - //// Update TesTasks - //await OrchestrateTesTasksOnBatchAsync("NodeEvent", query, (tasks, cancellationToken) => batchScheduler.MethodToBeWrittenAsync(tasks, events, cancellationToken), stoppingToken); + // Helpers + async ValueTask ProcessMessage(TesEventMessage messageInfo, CancellationToken cancellationToken) + { + // TODO: remove the switch (keeping the message retrieval) when GetCompletedBatchState can process the rest + switch (messageInfo.Event) + { + case "taskCompleted": + messages.Add(await messageInfo.GetMessageAsync(cancellationToken)); + break; + + default: + break; + } + + await messageInfo.MarkMessageProcessed(cancellationToken); + } + + async IAsyncEnumerable GetTesTasks([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken 
cancellationToken) + { + foreach (var id in messages.Select(t => batchScheduler.GetTesTaskIdFromCloudTaskId(t.EntityId))) + { + TesTask tesTask = default; + if (await repository.TryGetItemAsync(id, cancellationToken, task => tesTask = task) && tesTask is not null) + { + logger.LogDebug("Completing task {TesTask}.", tesTask.Id); + yield return tesTask; + } + else + { + logger.LogDebug("Could not find task {TesTask}.", id); + yield return null; + } + } + } - //// Delete blobs + AzureBatchTaskState GetCompletedBatchState(Tes.Runner.Events.EventMessage task) + { + logger.LogDebug("Getting batch task state from event {EventName} for {TesTask}.", task.Name, task.EntityId); + return task.Name switch + { + "taskCompleted" => string.IsNullOrWhiteSpace(task.EventData["errorMessage"]) + + ? new( + AzureBatchTaskState.TaskState.CompletedSuccessfully, + BatchTaskStartTime: task.Created - TimeSpan.Parse(task.EventData["duration"]), + BatchTaskEndTime: task.Created/*, + BatchTaskExitCode: 0*/) + + : new( + AzureBatchTaskState.TaskState.CompletedWithErrors, + Failure: new("ExecutorError", + Enumerable.Empty() + .Append(task.EventData["errorMessage"])), + BatchTaskStartTime: task.Created - TimeSpan.Parse(task.EventData["duration"]), + BatchTaskEndTime: task.Created/*, + BatchTaskExitCode: 0*/), + + // TODO: the rest + _ => throw new System.Diagnostics.UnreachableException(), + }; + } } } } diff --git a/src/TesApi.Web/Startup.cs b/src/TesApi.Web/Startup.cs index 2fb283ff1..3065dbe63 100644 --- a/src/TesApi.Web/Startup.cs +++ b/src/TesApi.Web/Startup.cs @@ -80,6 +80,8 @@ public void ConfigureServices(IServiceCollection services) .AddSingleton, TesRepositoryCache>() .AddSingleton() .AddSingleton() + .AddTransient() + .AddSingleton() .AddTransient() .AddSingleton() .AddSingleton(CreateBatchPoolManagerFromConfiguration) diff --git a/src/TesApi.Web/TesApi.Web.csproj b/src/TesApi.Web/TesApi.Web.csproj index ea889c7b5..129cc398e 100644 --- a/src/TesApi.Web/TesApi.Web.csproj +++ 
b/src/TesApi.Web/TesApi.Web.csproj @@ -15,7 +15,7 @@ - + @@ -63,6 +63,7 @@ + diff --git a/src/TesApi.Web/TesEventMessage.cs b/src/TesApi.Web/TesEventMessage.cs new file mode 100644 index 000000000..dcf7a81b4 --- /dev/null +++ b/src/TesApi.Web/TesEventMessage.cs @@ -0,0 +1,120 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using TesApi.Web.Storage; + +namespace TesApi.Web +{ + /// + /// Represents the events sent by the node task runner. + /// + public class TesEventMessage + { + static TesEventMessage() => Tes.Utilities.NewtonsoftJsonSafeInit.SetDefaultSettings(); + + /// + /// Blob tag used to record event processing. + /// + public const string ProcessedTag = "processed"; + + private readonly IStorageAccessProvider _storageAccessProvider; + private readonly IAzureProxy _azureProxy; + private readonly ILogger _logger; + private readonly Uri _uri; + + /// + /// Tags of this event message. + /// + public IDictionary Tags { get; } + + /// + /// Event of this event message. + /// + public string Event { get; } + + /// + /// Constructor of . 
+ /// + /// + /// + /// + /// + /// + /// + public TesEventMessage(IAzureProxy azureProxy, ILogger logger, IStorageAccessProvider storageAccessProvider, Uri blobAbsoluteUri, IDictionary tags, string @event) + { + ArgumentNullException.ThrowIfNull(azureProxy); + ArgumentNullException.ThrowIfNull(storageAccessProvider); + ArgumentNullException.ThrowIfNull(blobAbsoluteUri); + ArgumentNullException.ThrowIfNull(tags); + ArgumentNullException.ThrowIfNull(@event); + + if (tags.ContainsKey(ProcessedTag)) + { + throw new ArgumentException("This message was already processed.", nameof(tags)); + } + + _azureProxy = azureProxy; + _logger = logger; + _storageAccessProvider = storageAccessProvider; + _uri = blobAbsoluteUri; + Tags = tags.AsReadOnly(); + Event = @event; + } + + /// + /// Gets the details of this event message. + /// + /// + /// + public async Task GetMessageAsync(CancellationToken cancellationToken) + { + var messageText = await _azureProxy.DownloadBlobAsync(_uri, cancellationToken); + var result = Newtonsoft.Json.JsonConvert.DeserializeObject(messageText); + // TODO: throw if null + return result; + } + + /// + /// Marks this event message processed. + /// + /// + /// + public async Task MarkMessageProcessed(CancellationToken cancellationToken) + { + var uri = await _storageAccessProvider.MapLocalPathToSasUrlAsync(_uri.ToString(), cancellationToken); + await _azureProxy.SetBlobTags(new Uri(uri), Tags.Append(new KeyValuePair(ProcessedTag, DateTime.UtcNow.ToString("O"))).ToDictionary(pair => pair.Key, pair => pair.Value), cancellationToken); + } + } + + /// + /// Factory to create TesEventMessage instances. + /// + public sealed class BatchTesEventMessageFactory + { + private readonly IServiceProvider _serviceProvider; + + /// + /// Constructor for . + /// + /// A service object. + public BatchTesEventMessageFactory(IServiceProvider serviceProvider) => _serviceProvider = serviceProvider; + + /// + /// Creates a new . 
+ /// + /// + /// + /// + /// + public TesEventMessage CreateNew(Uri blobAbsoluteUri, IDictionary tags, string @event) + => ActivatorUtilities.CreateInstance(_serviceProvider, blobAbsoluteUri, tags, @event); + } +} From 21e619965a08566d798b45aefecdc58c9ac2e504 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 9 Oct 2023 15:07:44 -0700 Subject: [PATCH 027/202] Cleanup --- src/TesApi.Web/AzureProxy.cs | 4 +- src/TesApi.Web/BatchPool.cs | 17 +++-- src/TesApi.Web/BatchPoolService.cs | 7 +- src/TesApi.Web/BatchScheduler.cs | 70 +++++++++---------- .../CachingWithRetriesAzureProxy.cs | 3 +- .../SystemThreadingTaskExtensions.cs | 21 +----- src/TesApi.Web/IAzureProxy.cs | 5 +- .../OrchestrateOnBatchSchedulerServiceBase.cs | 9 +-- 8 files changed, 56 insertions(+), 80 deletions(-) diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index eecee4814..ee8b65f37 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -547,7 +547,7 @@ public Task DisableBatchPoolAutoScaleAsync(string poolId, CancellationToken canc => batchClient.PoolOperations.DisableAutoScaleAsync(poolId, cancellationToken: cancellationToken); /// - public async Task EnableBatchPoolAutoScaleAsync(string poolId, bool preemptable, TimeSpan interval, IAzureProxy.BatchPoolAutoScaleFormulaFactory formulaFactory, CancellationToken cancellationToken) + public async Task EnableBatchPoolAutoScaleAsync(string poolId, bool preemptable, TimeSpan interval, IAzureProxy.BatchPoolAutoScaleFormulaFactory formulaFactory, Func> currentTargetFunc, CancellationToken cancellationToken) { var (allocationState, _, _, currentLowPriority, _, currentDedicated) = await GetFullAllocationStateAsync(poolId, cancellationToken); @@ -556,7 +556,7 @@ public async Task EnableBatchPoolAutoScaleAsync(string poolId, bool preemptable, throw new InvalidOperationException(); } - var formula = formulaFactory(preemptable, preemptable ? currentLowPriority ?? 0 : currentDedicated ?? 
0); + var formula = formulaFactory(preemptable, await currentTargetFunc(preemptable ? currentLowPriority ?? 0 : currentDedicated ?? 0)); logger.LogDebug("Setting Pool {PoolID} to AutoScale({AutoScaleInterval}): '{AutoScaleFormula}'", poolId, interval, formula.Replace(Environment.NewLine, @"\n")); await batchClient.PoolOperations.EnableAutoScaleAsync(poolId, formula, interval, cancellationToken: cancellationToken); } diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index f55839117..76183c6cc 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -4,7 +4,6 @@ using System; using System.Collections.Generic; using System.Linq; -using System.Runtime.CompilerServices; using System.Threading; using System.Threading.Tasks; using Azure; @@ -257,7 +256,11 @@ private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken canc { switch (_scalingMode) { - case ScalingMode.AutoScaleEnabled: + case ScalingMode.AutoScaleEnabled when autoScaleEnabled != true: + _scalingMode = ScalingMode.RemovingFailedNodes; + break; + + case ScalingMode.AutoScaleEnabled when autoScaleEnabled == true: if (_resetAutoScalingRequired || await GetNodesToRemove(false).AnyAsync(cancellationToken)) { _logger.LogInformation(@"Switching pool {PoolId} to manual scale to clear resize errors and/or compute nodes in invalid states.", Id); @@ -318,7 +321,7 @@ private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken canc ResizeErrors.Clear(); _resizeErrorsRetrieved = true; _logger.LogInformation(@"Switching pool {PoolId} back to autoscale.", Id); - await _azureProxy.EnableBatchPoolAutoScaleAsync(Id, !IsDedicated, AutoScaleEvaluationInterval, (p, t) => AutoPoolFormula(p, GetTaskCount(t)), cancellationToken); + await _azureProxy.EnableBatchPoolAutoScaleAsync(Id, !IsDedicated, AutoScaleEvaluationInterval, AutoPoolFormula, GetTaskCountAsync, cancellationToken); _autoScaleWaitTime = DateTime.UtcNow + (3 * AutoScaleEvaluationInterval) + 
BatchPoolService.RunInterval; _scalingMode = _resetAutoScalingRequired ? ScalingMode.WaitingForAutoScale : ScalingMode.SettingAutoScale; break; @@ -337,11 +340,11 @@ private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken canc break; } - int GetTaskCount(int @default) // Used to make reenabling auto-scale more performant by attempting to gather the current number of "pending" tasks, falling back on the current target. + async ValueTask GetTaskCountAsync(int @default) // Used to make reenabling auto-scale more performant by attempting to gather the current number of "pending" tasks, falling back on the current target. { try { - return GetTasksAsync(includeCompleted: false).CountAsync(cancellationToken).AsTask().Result; + return await GetTasksAsync(includeCompleted: false).CountAsync(cancellationToken); } catch { @@ -474,7 +477,7 @@ internal async ValueTask ServicePoolAsync(ServiceKind serviceKind, CancellationT } /// - public async IAsyncEnumerable<(string taskId, AzureBatchTaskState)> ServicePoolAsync([EnumeratorCancellation] CancellationToken cancellationToken) + public async IAsyncEnumerable<(string taskId, AzureBatchTaskState)> ServicePoolAsync([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) { var exceptions = new List(); @@ -527,7 +530,7 @@ async ValueTask PerformTask(ValueTask serviceAction, CancellationToken can } #pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously - async IAsyncEnumerable GetFailures([EnumeratorCancellation] CancellationToken cancellationToken) + async IAsyncEnumerable GetFailures([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously { for (var failure = PopNextStartTaskFailure(); failure is not null; failure = PopNextStartTaskFailure()) diff --git a/src/TesApi.Web/BatchPoolService.cs 
b/src/TesApi.Web/BatchPoolService.cs index 7e389e798..cb52c8866 100644 --- a/src/TesApi.Web/BatchPoolService.cs +++ b/src/TesApi.Web/BatchPoolService.cs @@ -5,7 +5,6 @@ using System.Collections.Concurrent; using System.Collections.Generic; using System.Linq; -using System.Runtime.CompilerServices; using System.Threading; using System.Threading.Tasks; using Microsoft.Azure.Batch; @@ -108,7 +107,7 @@ private async ValueTask ExecuteServiceBatchPoolsAsync(CancellationToken stopping await OrchestrateTesTasksOnBatchAsync( "Failures", #pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously - async token => list.Select(t => t.TesTask).ToAsyncEnumerable(), + async _ => list.Select(t => t.TesTask).ToAsyncEnumerable(), #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously (tesTasks, token) => batchScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, list.Select(t => t.State).ToArray(), token), stoppingToken); @@ -150,12 +149,12 @@ private async ValueTask ProcessCompletedCloudTasksAsync(CancellationToken stoppi await OrchestrateTesTasksOnBatchAsync( "Completed", #pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously - async token => GetTesTasks(token), + async cancellationToken => GetTesTasks(cancellationToken), #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously (tesTasks, token) => batchScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, tasks.Select(GetCompletedBatchState).ToArray(), token), stoppingToken); - async IAsyncEnumerable GetTesTasks([EnumeratorCancellation] CancellationToken cancellationToken) + async IAsyncEnumerable GetTesTasks([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) { foreach (var id in tasks.Select(t => batchScheduler.GetTesTaskIdFromCloudTaskId(t.Id))) { diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs 
index 00a6559e7..754dd10e0 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -15,7 +15,6 @@ using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using Tes.Extensions; -using Tes.Models; using TesApi.Web.Extensions; using TesApi.Web.Management; using TesApi.Web.Management.Models.Quotas; @@ -33,7 +32,7 @@ namespace TesApi.Web { /// - /// Orchestrates s on Azure Batch + /// Orchestrates s on Azure Batch /// public partial class BatchScheduler : IBatchScheduler { @@ -79,16 +78,16 @@ public partial class BatchScheduler : IBatchScheduler private readonly TaskExecutionScriptingManager taskExecutionScriptingManager; /// - /// Orchestrates s on Azure Batch + /// Constructor for /// /// Logger - /// Configuration of - /// Configuration of - /// Configuration of - /// Configuration of - /// Configuration of - /// Configuration of - /// Configuration of + /// Configuration of + /// Configuration of + /// Configuration of + /// Configuration of + /// Configuration of + /// Configuration of + /// Configuration of /// Azure proxy /// Storage access provider /// Quota verifier > @@ -97,7 +96,7 @@ public partial class BatchScheduler : IBatchScheduler /// Batch pool factory /// Service to get allowed vm sizes. 
/// - /// /param> + /// public BatchScheduler( ILogger logger, IOptions batchGen1Options, @@ -232,30 +231,27 @@ async Task SetTaskExecutorError(TesTask tesTask, CombinedBatchTaskInfo bat return await SetTaskStateAndLog(tesTask, TesState.EXECUTORERROREnum, batchInfo, cancellationToken); } - //async Task SetTaskSystemError(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) - //{ - // await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); - // return await SetTaskStateAndLog(tesTask, TesState.SYSTEMERROREnum, batchInfo, cancellationToken); - //} + async Task SetTaskSystemError(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) + { + await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); + return await SetTaskStateAndLog(tesTask, TesState.SYSTEMERROREnum, batchInfo, cancellationToken); + } - async Task DeleteBatchTaskAndSetTaskStateAsync(TesTask tesTask, TesState newTaskState, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) + async Task SetTaskStateAfterFailureAsync(TesTask tesTask, TesState newTaskState, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); return await SetTaskStateAndLog(tesTask, newTaskState, batchInfo, cancellationToken); } - Task DeleteBatchTaskAndSetTaskExecutorErrorAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) => DeleteBatchTaskAndSetTaskStateAsync(tesTask, TesState.EXECUTORERROREnum, batchInfo, cancellationToken); - Task DeleteBatchTaskAndSetTaskSystemErrorAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) => DeleteBatchTaskAndSetTaskStateAsync(tesTask, TesState.SYSTEMERROREnum, batchInfo, cancellationToken); - - Task DeleteBatchTaskAndRequeueTaskAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, 
CancellationToken cancellationToken) + Task RequeueTaskAfterFailureAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) => ++tesTask.ErrorCount > 3 - ? AddSystemLogAndDeleteBatchTaskAndSetTaskExecutorErrorAsync(tesTask, batchInfo, "System Error: Retry count exceeded.", cancellationToken) - : DeleteBatchTaskAndSetTaskStateAsync(tesTask, TesState.QUEUEDEnum, batchInfo, cancellationToken); + ? AddSystemLogAndSetTaskExecutorErrorAsync(tesTask, batchInfo, "System Error: Retry count exceeded.", cancellationToken) + : SetTaskStateAfterFailureAsync(tesTask, TesState.QUEUEDEnum, batchInfo, cancellationToken); - Task AddSystemLogAndDeleteBatchTaskAndSetTaskExecutorErrorAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, string alternateSystemLogItem, CancellationToken cancellationToken) + Task AddSystemLogAndSetTaskExecutorErrorAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, string alternateSystemLogItem, CancellationToken cancellationToken) { batchInfo = new(new(batchInfo.State, batchInfo.OutputFileLogs, new(batchInfo.Failure.Reason, (batchInfo.Failure.SystemLogs ?? 
Enumerable.Empty()).Append(alternateSystemLogItem)), batchInfo.CloudTaskCreationTime, batchInfo.BatchTaskStartTime, batchInfo.BatchTaskEndTime, batchInfo.BatchTaskExitCode), batchInfo.AlternateSystemLogItem); - return DeleteBatchTaskAndSetTaskExecutorErrorAsync(tesTask, batchInfo, cancellationToken); + return SetTaskExecutorError(tesTask, batchInfo, cancellationToken); } Task HandlePreemptedNodeAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) @@ -272,14 +268,14 @@ Task HandlePreemptedNodeAsync(TesTask tesTask, CombinedBatchTaskInfo batch //new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.JobNotFound, alternateSystemLogItem: null, (tesTask, _, ct) => AddBatchTaskAsync(tesTask, ct)), //new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.MissingBatchTask, alternateSystemLogItem: null, (tesTask, _, ct) => AddBatchTaskAsync(tesTask, ct)), new TesTaskStateTransition(tesTaskIsQueued, AzureBatchTaskState.TaskState.Initializing, alternateSystemLogItem: null, (tesTask, _) => { tesTask.State = TesState.INITIALIZINGEnum; return true; }), - new TesTaskStateTransition(tesTaskIsQueuedOrInitializing, AzureBatchTaskState.TaskState.NodeAllocationFailed, alternateSystemLogItem: null, DeleteBatchTaskAndRequeueTaskAsync), + new TesTaskStateTransition(tesTaskIsQueuedOrInitializing, AzureBatchTaskState.TaskState.NodeAllocationFailed, alternateSystemLogItem: null, RequeueTaskAfterFailureAsync), new TesTaskStateTransition(tesTaskIsQueuedOrInitializing, AzureBatchTaskState.TaskState.Running, alternateSystemLogItem: null, (tesTask, _) => { tesTask.State = TesState.RUNNINGEnum; return true; }), //new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.MoreThanOneActiveJobOrTaskFound, BatchTaskState.MoreThanOneActiveJobOrTaskFound.ToString(), DeleteBatchJobAndSetTaskSystemErrorAsync), new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, 
AzureBatchTaskState.TaskState.CompletedSuccessfully, alternateSystemLogItem: null, SetTaskCompleted), new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.CompletedWithErrors, "Please open an issue. There should have been an error reported here.", SetTaskExecutorError), //new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.ActiveJobWithMissingAutoPool, alternateSystemLogItem: null, DeleteBatchJobAndRequeueTaskAsync), - new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.NodeFailedDuringStartupOrExecution, "Please open an issue. There should have been an error reported here.", DeleteBatchTaskAndSetTaskSystemErrorAsync), - new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.NodeUnusable, "Please open an issue. There should have been an error reported here.", DeleteBatchTaskAndSetTaskExecutorErrorAsync), + new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.NodeFailedDuringStartupOrExecution, "Please open an issue. There should have been an error reported here.", SetTaskSystemError), + new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.NodeUnusable, "Please open an issue. 
There should have been an error reported here.", SetTaskExecutorError), //new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.JobNotFound, BatchTaskState.JobNotFound.ToString(), SetTaskSystemError), //new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.MissingBatchTask, BatchTaskState.MissingBatchTask.ToString(), DeleteBatchJobAndSetTaskSystemErrorAsync), new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.NodePreempted, alternateSystemLogItem: null, HandlePreemptedNodeAsync) @@ -444,6 +440,13 @@ private static string StandardizeStartTaskPath(string startTaskPath, string defa } } + /// + public string GetTesTaskIdFromCloudTaskId(string cloudTaskId) + { + var separatorIndex = cloudTaskId.LastIndexOf('-'); + return separatorIndex == -1 ? cloudTaskId : cloudTaskId[..separatorIndex]; + } + /// /// Determines if the file is a Cromwell command script /// See https://github.com/broadinstitute/cromwell/blob/17efd599d541a096dc5704991daeaefdd794fefd/supportedBackends/tes/src/main/scala/cromwell/backend/impl/tes/TesTask.scala#L58 @@ -453,13 +456,6 @@ private static string StandardizeStartTaskPath(string startTaskPath, string defa private static bool IsCromwellCommandScript(TesInput inputFile) => (inputFile.Name?.Equals("commandScript") ?? false) && (inputFile.Description?.EndsWith(".commandScript") ?? false) && inputFile.Type == TesFileType.FILEEnum && inputFile.Path.EndsWith($"/{CromwellScriptFileName}"); - /// - public string GetTesTaskIdFromCloudTaskId(string cloudTaskId) - { - var separatorIndex = cloudTaskId.LastIndexOf('-'); - return separatorIndex == -1 ? 
cloudTaskId : cloudTaskId[..separatorIndex]; - } - /// public async IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessQueuedTesTasksAsync(TesTask[] tesTasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) { @@ -1300,7 +1296,7 @@ static bool TryGetValueAsDouble(Dictionary dict, string key, out return dict.TryGetValue(key, out var valueAsString) && double.TryParse(valueAsString, out result); } - BatchNodeMetrics batchNodeMetrics = null; + Tes.Models.BatchNodeMetrics batchNodeMetrics = null; DateTimeOffset? taskStartTime = null; DateTimeOffset? taskEndTime = null; int? cromwellRcCode = null; @@ -1339,7 +1335,7 @@ static bool TryGetValueAsDouble(Dictionary dict, string key, out var diskSizeInGB = TryGetValueAsDouble(metrics, "DiskSizeInKiB", out var diskSizeInKiB) ? diskSizeInKiB / kiBInGB : (double?)null; var diskUsedInGB = TryGetValueAsDouble(metrics, "DiskUsedInKiB", out var diskUsedInKiB) ? diskUsedInKiB / kiBInGB : (double?)null; - batchNodeMetrics = new BatchNodeMetrics + batchNodeMetrics = new Tes.Models.BatchNodeMetrics { BlobXferImagePullDurationInSeconds = GetDurationInSeconds(metrics, "BlobXferPullStart", "BlobXferPullEnd"), ExecutorImagePullDurationInSeconds = GetDurationInSeconds(metrics, "ExecutorPullStart", "ExecutorPullEnd"), diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index 36ff2247b..c8363ed1d 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -204,7 +204,8 @@ public async Task CreateBatchPoolAsync(BatchModels.Pool poolInfo, boo public Task DisableBatchPoolAutoScaleAsync(string poolId, CancellationToken cancellationToken) => azureProxy.DisableBatchPoolAutoScaleAsync(poolId, cancellationToken); /// - public Task EnableBatchPoolAutoScaleAsync(string poolId, bool preemptable, TimeSpan interval, IAzureProxy.BatchPoolAutoScaleFormulaFactory formulaFactory, 
CancellationToken cancellationToken) => azureProxy.EnableBatchPoolAutoScaleAsync(poolId, preemptable, interval, formulaFactory, cancellationToken); + public Task EnableBatchPoolAutoScaleAsync(string poolId, bool preemptable, TimeSpan interval, IAzureProxy.BatchPoolAutoScaleFormulaFactory formulaFactory, Func> currentTargetFunc, CancellationToken cancellationToken) + => azureProxy.EnableBatchPoolAutoScaleAsync(poolId, preemptable, interval, formulaFactory, currentTargetFunc, cancellationToken); /// public Task EvaluateAutoScaleAsync(string poolId, string autoscaleFormula, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.EvaluateAutoScaleAsync(poolId, autoscaleFormula, ct), cancellationToken); diff --git a/src/TesApi.Web/Extensions/SystemThreadingTaskExtensions.cs b/src/TesApi.Web/Extensions/SystemThreadingTaskExtensions.cs index b2f4c2d0f..40148e9a0 100644 --- a/src/TesApi.Web/Extensions/SystemThreadingTaskExtensions.cs +++ b/src/TesApi.Web/Extensions/SystemThreadingTaskExtensions.cs @@ -2,11 +2,8 @@ // Licensed under the MIT License. using System; -using System.Collections; using System.Collections.Generic; -using System.Diagnostics.CodeAnalysis; using System.Linq; -using System.Runtime.CompilerServices; using System.Threading; using System.Threading.Channels; using System.Threading.Tasks; @@ -32,7 +29,7 @@ public static class SystemThreadingTaskExtensions /// A task is sent to the return enumeration when it is "complete", which is when it either completes successfully, fails (queues an exception), or is cancelled.
/// No items in should share an identical instance. /// - public static async IAsyncEnumerable WhenEach(this IEnumerable source, [EnumeratorCancellation] CancellationToken cancellationToken, Func sourceToTask = default) + public static async IAsyncEnumerable WhenEach(this IEnumerable source, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken, Func sourceToTask = default) { ArgumentNullException.ThrowIfNull(source); ArgumentNullException.ThrowIfNull(cancellationToken); @@ -43,17 +40,12 @@ public static async IAsyncEnumerable WhenEach(this IEnumerable source, var list = source.Where(e => e is not null).Select(e => (Entry: e, Task: sourceToTask(e))).ToList(); var pendingCount = list.Count; - if (list.Select(e => e.Task).ToHashSet(new SystemTaskEqualityComparer()).Count != pendingCount) // Check for duplicate tasks - { - throw new ArgumentException("Duplicate System.Threading.Tasks found referenced in collection.", nameof(source)); - } - if (list.Count == 0) { yield break; } - // There should be no more ArgumentExceptions after this point. + // There should be no new ArgumentExceptions after this point. var channel = Channel.CreateBounded(pendingCount); // Add continuations to every task. Those continuations will feed the foreach below @@ -80,14 +72,5 @@ public static async IAsyncEnumerable WhenEach(this IEnumerable source, yield return entry; } } - - private class SystemTaskEqualityComparer : IEqualityComparer - { - public bool Equals(Task x, Task y) - => ReferenceEquals(x, y); - - public int GetHashCode([DisallowNull] Task obj) - => obj?.GetHashCode() ?? 0; - } } } diff --git a/src/TesApi.Web/IAzureProxy.cs b/src/TesApi.Web/IAzureProxy.cs index 58ff82716..d47faa11c 100644 --- a/src/TesApi.Web/IAzureProxy.cs +++ b/src/TesApi.Web/IAzureProxy.cs @@ -267,9 +267,10 @@ public interface IAzureProxy /// Type of compute nodes: false if dedicated, otherwise true. /// The interval for periodic reevaluation of the formula. 
/// A factory function that generates an auto-scale formula. + /// A function that provides the initial compute node target. The function's argument is the current node target on the pool. /// A for controlling the lifetime of the asynchronous operation. /// - Task EnableBatchPoolAutoScaleAsync(string poolId, bool preemptable, TimeSpan interval, BatchPoolAutoScaleFormulaFactory formulaFactory, CancellationToken cancellationToken); + Task EnableBatchPoolAutoScaleAsync(string poolId, bool preemptable, TimeSpan interval, BatchPoolAutoScaleFormulaFactory formulaFactory, Func> currentTargetFunc, CancellationToken cancellationToken); /// /// Gets the result of evaluating an automatic scaling formula on the specified pool. This @@ -291,7 +292,7 @@ public interface IAzureProxy /// Describes a function to generate autoscale formulas /// /// Type of compute nodes: false if dedicated, otherwise true. - /// Current number of compute nodes. + /// The initial compute node target. /// delegate string BatchPoolAutoScaleFormulaFactory(bool preemptable, int currentTarget); } diff --git a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs index 6f44045af..245f61ab6 100644 --- a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs +++ b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs @@ -8,7 +8,6 @@ using System.Threading.Tasks; using Microsoft.Extensions.Hosting; using Microsoft.Extensions.Logging; -using Npgsql.Replication.PgOutput.Messages; using Tes.Extensions; using Tes.Models; using Tes.Repository; @@ -137,7 +136,6 @@ protected async Task ExecuteActionOnIntervalAsync(TimeSpan runInterval, FuncA System.Threading.Tasks.ValueTask that represents the long running operations. 
protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func>> tesTaskGetter, Func IsModifiedAsync)>> tesTaskProcessor, CancellationToken stoppingToken) { - var pools = new HashSet(); var tesTasks = await (await tesTaskGetter(stoppingToken)).ToArrayAsync(stoppingToken); if (tesTasks.All(task => task is null)) @@ -236,7 +234,6 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func< //TODO: retrieve fresh task if possible and add logs to the task in a similar way to the commanted out code block below. //Also: consider doing the same in the other place(s) this exception is caught. } - // TODO catch EF / postgres exception? //catch (Microsoft.Azure.Cosmos.CosmosException exc) //{ // TesTask currentTesTask = default; @@ -262,15 +259,11 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func< { logger.LogError(exc, "Updating TES Task '{TesTask}' threw {ExceptionType}: '{ExceptionMessage}'. Stack trace: {ExceptionStackTrace}", tesTask.Id, exc.GetType().FullName, exc.Message, exc.StackTrace); } - - if (!string.IsNullOrWhiteSpace(tesTask.PoolId) && (TesState.INITIALIZINGEnum == tesTask.State || TesState.RUNNINGEnum == tesTask.State)) - { - pools.Add(tesTask.PoolId); - } } if (batchScheduler.NeedPoolFlush) { + var pools = (await repository.GetItemsAsync(task => TesState.INITIALIZINGEnum == task.State || TesState.RUNNINGEnum == task.State, stoppingToken)).Select(task => task.PoolId).Distinct(); await batchScheduler.FlushPoolsAsync(pools, stoppingToken); } From 04a2cf672f04a0fe3ab70e7979ad4c35a400547f Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 9 Oct 2023 17:11:52 -0700 Subject: [PATCH 028/202] Fix blob tags query --- src/Tes.Runner/Events/BlobStorageEventSink.cs | 13 +----------- src/Tes.Runner/Events/EventSink.cs | 20 ++++++++++++------- .../Transfer/BlobBlockApiHttpUtils.cs | 4 ++-- src/TesApi.Web/AzureProxy.cs | 7 ++++--- src/TesApi.Web/BatchScheduler.cs | 12 +++++------ 
.../OrchestrateOnBatchSchedulerServiceBase.cs | 4 ++-- src/TesApi.Web/Scheduler.cs | 4 ++-- 7 files changed, 29 insertions(+), 35 deletions(-) diff --git a/src/Tes.Runner/Events/BlobStorageEventSink.cs b/src/Tes.Runner/Events/BlobStorageEventSink.cs index 53dc776f6..8fc305613 100644 --- a/src/Tes.Runner/Events/BlobStorageEventSink.cs +++ b/src/Tes.Runner/Events/BlobStorageEventSink.cs @@ -33,7 +33,7 @@ public override async Task HandleEventAsync(EventMessage eventMessage) var content = JsonSerializer.Serialize(eventMessage); await blobBlockApiHttpUtils.ExecuteHttpRequestAsync(() => - BlobBlockApiHttpUtils.CreatePutBlobRequestAsync(ToEventUrl(storageUrl, eventMessage), content, ApiVersion, ToTags(eventMessage))); + BlobBlockApiHttpUtils.CreatePutBlobRequestAsync(ToEventUrl(storageUrl, eventMessage), content, ApiVersion, ToEventTag(eventMessage))); } catch (Exception e) { @@ -58,17 +58,6 @@ private string ToEventUrl(Uri uri, EventMessage message) return blobBuilder.ToUri().ToString(); } - private Dictionary ToTags(EventMessage eventMessage) - { - return new Dictionary - { - { "task-id", eventMessage.EntityId }, - { "workflow-id", eventMessage.CorrelationId }, - { "event-name", eventMessage.Name }, - { "created", eventMessage.Created.ToString(Iso8601DateFormat) } - }; - } - private static string ToBlobName(EventMessage eventMessage) { var blobName = diff --git a/src/Tes.Runner/Events/EventSink.cs b/src/Tes.Runner/Events/EventSink.cs index 2489dc86e..7e4aeea37 100644 --- a/src/Tes.Runner/Events/EventSink.cs +++ b/src/Tes.Runner/Events/EventSink.cs @@ -51,19 +51,25 @@ public async Task StopAsync() await eventHandlerTask.WaitAsync(TimeSpan.FromSeconds(StopWaitDurationInSeconds)); } - protected IDictionary ToEventTag(EventMessage eventMessage) + protected static IDictionary ToEventTag(EventMessage eventMessage) { return new Dictionary { - { "event_name", eventMessage.Name }, - { "event_id", eventMessage.Id }, - { "entity_type", eventMessage.EntityType }, - { "task_id", 
eventMessage.EntityId }, - { "workflow_id", eventMessage.CorrelationId }, - //format date to ISO 8601, which is URL friendly + { "task-id", eventMessage.EntityId }, + { "workflow-id", eventMessage.CorrelationId }, + { "event-name", eventMessage.Name }, { "created", eventMessage.Created.ToString(Iso8601DateFormat) } + + //{ "event_name", eventMessage.Name }, + //{ "event_id", eventMessage.Id }, + //{ "entity_type", eventMessage.EntityType }, + //{ "task_id", eventMessage.EntityId }, + //{ "workflow_id", eventMessage.CorrelationId }, + ////format date to ISO 8601, which is URL friendly + //{ "created", eventMessage.Created.ToString(Iso8601DateFormat) } }; } + private async Task EventHandlerAsync() { while (await events.Reader.WaitToReadAsync()) diff --git a/src/Tes.Runner/Transfer/BlobBlockApiHttpUtils.cs b/src/Tes.Runner/Transfer/BlobBlockApiHttpUtils.cs index dd584173b..7c8b710e6 100644 --- a/src/Tes.Runner/Transfer/BlobBlockApiHttpUtils.cs +++ b/src/Tes.Runner/Transfer/BlobBlockApiHttpUtils.cs @@ -46,7 +46,7 @@ public static HttpRequestMessage CreatePutBlockRequestAsync(PipelineBuffer buffe } public static HttpRequestMessage CreatePutBlobRequestAsync(string blobUrl, string content, string apiVersion, - Dictionary tags) + IDictionary tags) { var request = new HttpRequestMessage(HttpMethod.Put, blobUrl) { @@ -58,7 +58,7 @@ public static HttpRequestMessage CreatePutBlobRequestAsync(string blobUrl, strin return request; } - private static void AddPutBlobHeaders(HttpRequestMessage request, string apiVersion, Dictionary? tags) + private static void AddPutBlobHeaders(HttpRequestMessage request, string apiVersion, IDictionary? 
tags) { AddPutBlockHeaders(request, apiVersion); diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index ee8b65f37..f2de060cb 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -417,17 +417,18 @@ public async Task> ListBlobsAsync(Uri directoryUri, Cance var directory = builder.BlobName; builder.BlobName = string.Empty; BlobContainerClient container = new(builder.ToUri()); + var fullTagsQuery = Enumerable.Empty() + .Append(new($"&where=@container='{container.Name}'")) + .Concat(tagsQuery.Select(pair => $"\"{pair.Key}\"='{pair.Value}'")); if (!directory.EndsWith('/')) { directory += "/"; } - return container.FindBlobsByTagsAsync($"&where=@container='{container.Name}' AND {string.Join(" AND", tagsQuery.Select(pair => $"\"{pair.Key}\"='{pair.Value}'"))}", cancellationToken) - .Where(blob => blob.BlobName.StartsWith(directory)); + return container.FindBlobsByTagsAsync(string.Join(" AND", fullTagsQuery), cancellationToken).Where(blob => blob.BlobName.StartsWith(directory)); } - /// public async Task SetBlobTags(Uri blobAbsoluteUri, IDictionary tags, CancellationToken cancellationToken) { diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 754dd10e0..4aff83ecd 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -1376,29 +1376,27 @@ private static Dictionary DelimitedTextToDictionary(string text, public async IAsyncEnumerable GetEventMessages([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken, string @event = null) { var path = "events"; + var tags = new Dictionary(); if (!string.IsNullOrWhiteSpace(@event)) { path += "/" + @event; + tags.Add("event-name", @event); } Uri directoryUri = new(await storageAccessProvider.GetInternalTesBlobUrlAsync(path, cancellationToken)); var accountSegments = StorageAccountUrlSegments.Create(directoryUri.ToString()); - await foreach (var blobItem in 
azureProxy.ListBlobsWithTagsAsync(directoryUri, new Dictionary() { { TesEventMessage.ProcessedTag, string.Empty } }, cancellationToken) - .WithCancellation(cancellationToken)) + await foreach (var blobItem in azureProxy.ListBlobsWithTagsAsync(directoryUri, tags, cancellationToken).WithCancellation(cancellationToken)) { if (blobItem.Tags.ContainsKey(TesEventMessage.ProcessedTag)) { continue; } - UriBuilder builder = new(directoryUri) - { - Path = $"{accountSegments.ContainerName}/{blobItem.BlobName}" - }; + UriBuilder builder = new(directoryUri) { Path = $"{accountSegments.ContainerName}/{blobItem.BlobName}" }; - var pathUnderDirectory = builder.Path[accountSegments.BlobName.Length..]; + var pathUnderDirectory = builder.Path[(builder.Path.LastIndexOf("/events/") + 8)..]; var eventName = pathUnderDirectory[..pathUnderDirectory.IndexOf('/')]; yield return batchTesEventMessageFactory.CreateNew(builder.Uri, blobItem.Tags, eventName); diff --git a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs index 245f61ab6..409189f9c 100644 --- a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs +++ b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs @@ -140,7 +140,7 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func< if (tesTasks.All(task => task is null)) { - logger.LogDebug("OrchestrateTesTasksOnBatch({Poll}) skipped.", pollName); + // Quick return for no tasks return; } @@ -263,7 +263,7 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func< if (batchScheduler.NeedPoolFlush) { - var pools = (await repository.GetItemsAsync(task => TesState.INITIALIZINGEnum == task.State || TesState.RUNNINGEnum == task.State, stoppingToken)).Select(task => task.PoolId).Distinct(); + var pools = (await repository.GetItemsAsync(task => task.State == TesState.INITIALIZINGEnum || task.State == TesState.RUNNINGEnum, stoppingToken)).Select(task => 
task.PoolId).Distinct(); await batchScheduler.FlushPoolsAsync(pools, stoppingToken); } diff --git a/src/TesApi.Web/Scheduler.cs b/src/TesApi.Web/Scheduler.cs index c8919c8b1..cc273e9d2 100644 --- a/src/TesApi.Web/Scheduler.cs +++ b/src/TesApi.Web/Scheduler.cs @@ -160,7 +160,7 @@ async ValueTask UpdateTesTasksFromEventBlobsAsync(CancellationToken stoppingToke var messages = new ConcurrentBag(); // Get and parse event blobs - await foreach (var message in batchScheduler.GetEventMessages(stoppingToken).WithCancellation(stoppingToken)) + await foreach (var message in batchScheduler.GetEventMessages(stoppingToken, "taskCompleted").WithCancellation(stoppingToken)) { messageInfos.Add(message); } @@ -183,7 +183,7 @@ await OrchestrateTesTasksOnBatchAsync( // Helpers async ValueTask ProcessMessage(TesEventMessage messageInfo, CancellationToken cancellationToken) { - // TODO: remove the switch (keeping the message retrieval) when GetCompletedBatchState can process the rest + // TODO: remove the switch (keeping the message retrieval) when GetCompletedBatchState can process all events switch (messageInfo.Event) { case "taskCompleted": From 6b7f6f090c8582c7eaa0ecfd57fc0a1eb46514e5 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 9 Oct 2023 18:27:19 -0700 Subject: [PATCH 029/202] Address possible multithreaded issue --- src/TesApi.Web/BatchPoolService.cs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/TesApi.Web/BatchPoolService.cs b/src/TesApi.Web/BatchPoolService.cs index cb52c8866..fb067e3ec 100644 --- a/src/TesApi.Web/BatchPoolService.cs +++ b/src/TesApi.Web/BatchPoolService.cs @@ -102,7 +102,7 @@ private async ValueTask ExecuteServiceBatchPoolsAsync(CancellationToken stopping { var list = new ConcurrentBag<(TesTask TesTask, AzureBatchTaskState State)>(); - await ExecuteActionOnPoolsAsync("ServiceBatchPools", (pool, token) => ProcessFailures(pool.ServicePoolAsync(token), token), stoppingToken); + await ExecuteActionOnPoolsAsync("Service 
Batch Pools", (pool, token) => ProcessFailures(pool.ServicePoolAsync(token), token), stoppingToken); await OrchestrateTesTasksOnBatchAsync( "Failures", @@ -142,8 +142,10 @@ private Task ExecuteCompletedTesTasksOnBatchAsync(CancellationToken stoppingToke /// private async ValueTask ProcessCompletedCloudTasksAsync(CancellationToken stoppingToken) { - var tasks = new List(); - await ExecuteActionOnPoolsAsync("ServiceBatchTasks", async (pool, token) => tasks.AddRange(await pool.GetCompletedTasks(token).ToListAsync(token)), stoppingToken); + var tasks = new ConcurrentBag(); + + await ExecuteActionOnPoolsAsync("Service Batch Tasks", async (pool, token) => await pool.GetCompletedTasks(token).ForEachAsync(tasks.Add, token), stoppingToken); + logger.LogDebug("ProcessCompletedCloudTasksAsync found {CompletedTasks} completed tasks.", tasks.Count); await OrchestrateTesTasksOnBatchAsync( From ffcb2ddf9020f1fea4f8cdc7e972f207cc70da1d Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Tue, 10 Oct 2023 17:52:34 -0700 Subject: [PATCH 030/202] Restore incorrectly removed file --- nuget.config | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 nuget.config diff --git a/nuget.config b/nuget.config new file mode 100644 index 000000000..08221cbbc --- /dev/null +++ b/nuget.config @@ -0,0 +1,7 @@ + + + + + + + \ No newline at end of file From e06be2eac7fc02ab4c937dbfec513f505b985203 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 11 Oct 2023 15:22:17 -0700 Subject: [PATCH 031/202] Address deployer issues --- src/deploy-tes-on-azure/Deployer.cs | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/deploy-tes-on-azure/Deployer.cs b/src/deploy-tes-on-azure/Deployer.cs index d2891c260..b7861f518 100644 --- a/src/deploy-tes-on-azure/Deployer.cs +++ b/src/deploy-tes-on-azure/Deployer.cs @@ -237,6 +237,8 @@ await Execute("Connecting to Azure Services...", async () => configuration.EnableIngress = bool.TryParse(enableIngress, out var 
parsed) ? parsed : null; var tesCredentials = new FileInfo(Path.Combine(Directory.GetCurrentDirectory(), TesCredentialsFileName)); + tesCredentials.Refresh(); + if (configuration.EnableIngress.GetValueOrDefault() && tesCredentials.Exists) { try @@ -327,12 +329,21 @@ await Execute("Connecting to Azure Services...", async () => // Ensure all storage containers are created. await CreateDefaultStorageContainersAsync(storageAccount); + // Always place the compute nodes into the new batch subnet (needed for simplified communication with batch and is faster/cheaper for azure services access). + await AssignMIAsNetworkContributorToResourceAsync(managedIdentity, resourceGroup); + if (string.IsNullOrWhiteSpace(settings["BatchNodesSubnetId"])) { settings["BatchNodesSubnetId"] = await UpdateVnetWithBatchSubnet(); } } + if (installedVersion is null || installedVersion < new Version(4, 7)) + { + // Storage account now requires Storage Blob Data Owner + await AssignVmAsDataOwnerToStorageAccountAsync(managedIdentity, storageAccount); + } + await kubernetesManager.UpgradeValuesYamlAsync(storageAccount, settings); await PerformHelmDeploymentAsync(resourceGroup); } @@ -566,9 +577,9 @@ await Execute( } var maxPerFamilyQuota = batchAccount.DedicatedCoreQuotaPerVMFamilyEnforced ? batchAccount.DedicatedCoreQuotaPerVMFamily.Select(q => q.CoreQuota).Where(q => 0 != q) : Enumerable.Repeat(batchAccount.DedicatedCoreQuota ?? 
0, 1); - bool isBatchQuotaAvailable = batchAccount.LowPriorityCoreQuota > 0 || (batchAccount.DedicatedCoreQuota > 0 && maxPerFamilyQuota.Append(0).Max() > 0); - bool isBatchPoolQuotaAvailable = batchAccount.PoolQuota > 0; - bool isBatchJobQuotaAvailable = batchAccount.ActiveJobAndJobScheduleQuota > 0; + var isBatchQuotaAvailable = batchAccount.LowPriorityCoreQuota > 0 || (batchAccount.DedicatedCoreQuota > 0 && maxPerFamilyQuota.Append(0).Max() > 0); + var isBatchPoolQuotaAvailable = batchAccount.PoolQuota > 0; + var isBatchJobQuotaAvailable = batchAccount.ActiveJobAndJobScheduleQuota > 0; var insufficientQuotas = new List(); int exitCode; @@ -583,8 +594,8 @@ await Execute( ConsoleEx.WriteLine("Could not run the test task.", ConsoleColor.Yellow); } - string quotaMessage = string.Join(" and ", insufficientQuotas); - string batchAccountName = configuration.BatchAccountName; + var quotaMessage = string.Join(" and ", insufficientQuotas); + var batchAccountName = configuration.BatchAccountName; ConsoleEx.WriteLine($"Deployment was successful, but Batch account {batchAccountName} does not have sufficient {quotaMessage} quota to run workflows.", ConsoleColor.Yellow); ConsoleEx.WriteLine($"Request Batch {quotaMessage} quota: https://docs.microsoft.com/en-us/azure/batch/batch-quota-limit", ConsoleColor.Yellow); ConsoleEx.WriteLine("After receiving the quota, read the docs to run a test workflow and confirm successful deployment.", ConsoleColor.Yellow); From 6c8f45001828a4d6d733bc89c8a425a48455685c Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 12 Oct 2023 23:10:47 -0700 Subject: [PATCH 032/202] Process all TaskComplete events, only once --- src/Tes.ApiClients/RetryHandler.cs | 62 +++++++++- src/Tes.Runner/Events/EventsPublisher.cs | 4 +- .../TaskExecutionScriptingManagerTests.cs | 4 +- .../TerraStorageAccessProviderTests.cs | 4 +- src/TesApi.Web/AzureProxy.cs | 42 ++++--- src/TesApi.Web/BatchPoolFactory.cs | 1 + src/TesApi.Web/BatchPoolService.cs | 12 +- 
src/TesApi.Web/BatchScheduler.cs | 110 +++++++++--------- .../CachingWithRetriesAzureProxy.cs | 8 +- .../SystemThreadingTaskExtensions.cs | 7 +- src/TesApi.Web/IAzureProxy.cs | 14 ++- src/TesApi.Web/IBatchScheduler.cs | 4 +- src/TesApi.Web/PagedInterfaceExtensions.cs | 30 ++++- src/TesApi.Web/Scheduler.cs | 43 ++----- .../Storage/DefaultStorageAccessProvider.cs | 105 ++++++++++------- .../Storage/IStorageAccessProvider.cs | 13 ++- .../Storage/StorageAccessProvider.cs | 17 ++- .../Storage/TerraStorageAccessProvider.cs | 56 ++++----- src/TesApi.Web/TesEventMessage.cs | 66 ++++++++++- 19 files changed, 378 insertions(+), 224 deletions(-) diff --git a/src/Tes.ApiClients/RetryHandler.cs b/src/Tes.ApiClients/RetryHandler.cs index 59af57461..85236ef46 100644 --- a/src/Tes.ApiClients/RetryHandler.cs +++ b/src/Tes.ApiClients/RetryHandler.cs @@ -19,6 +19,30 @@ public class RetryHandler private readonly AsyncRetryPolicy asyncRetryPolicy = null!; private readonly AsyncRetryPolicy asyncHttpRetryPolicy = null!; + /// + /// The key in where or is stored. + /// + public const string OnRetryHandlerKey = "OnRetryHandler"; + + /// + /// The action to call on each retry. + /// + /// The handled exception. + /// The current sleep duration. + /// The current retry count. It starts at 1 between the first handled condition and the first wait, then 2, etc. + /// This is called right before the wait. + public delegate void OnRetryHandler(Exception outcome, TimeSpan timespan, int retryCount); + + /// + /// The action to call on each retry. + /// + /// See . + /// The handled exception or result. + /// The current sleep duration. + /// The current retry count. It starts at 1 between the first handled condition and the first wait, then 2, etc. + /// This is called right before the wait. + public delegate void OnRetryHandler(DelegateResult result, TimeSpan timespan, int retryCount); + /// /// Synchronous retry policy instance. 
/// @@ -32,17 +56,27 @@ public RetryHandler(IOptions retryPolicyOptions) .Handle() .WaitAndRetry(retryPolicyOptions.Value.MaxRetryCount, (attempt) => TimeSpan.FromSeconds(Math.Pow(retryPolicyOptions.Value.ExponentialBackOffExponent, - attempt))); + attempt)), OnRetry); this.asyncRetryPolicy = Policy .Handle() .WaitAndRetryAsync(retryPolicyOptions.Value.MaxRetryCount, (attempt) => TimeSpan.FromSeconds(Math.Pow(retryPolicyOptions.Value.ExponentialBackOffExponent, - attempt))); + attempt)), OnRetry); this.asyncHttpRetryPolicy = HttpPolicyExtensions.HandleTransientHttpError() .OrResult(r => r.StatusCode == HttpStatusCode.TooManyRequests) .WaitAndRetryAsync(retryPolicyOptions.Value.MaxRetryCount, (attempt) => TimeSpan.FromSeconds(Math.Pow(retryPolicyOptions.Value.ExponentialBackOffExponent, - attempt))); + attempt)), OnRetry); + } + + private static void OnRetry(DelegateResult result, TimeSpan span, int retryCount, Context ctx) + { + ctx.GetOnRetryHandler()?.Invoke(result, span, retryCount); + } + + private static void OnRetry(Exception outcome, TimeSpan timespan, int retryCount, Context ctx) + { + ctx.GetOnRetryHandler()?.Invoke(outcome, timespan, retryCount); } /// @@ -132,3 +166,25 @@ public virtual async Task ExecuteHttpRequestWithRetryAsync( return await asyncHttpRetryPolicy.ExecuteAsync(action, cancellationToken); } } + +public static class RetryHandlerExtensions +{ + public static void SetOnRetryHandler(this Context context, Action, TimeSpan, int> onRetry) + { + context[RetryHandler.OnRetryHandlerKey] = onRetry; + } + public static Action, TimeSpan, int>? GetOnRetryHandler(this Context context) + { + return context.TryGetValue(RetryHandler.OnRetryHandlerKey, out var handler) ? (Action, TimeSpan, int>)handler : default; + } + + public static void SetOnRetryHandler(this Context context, Action onRetry) + { + context[RetryHandler.OnRetryHandlerKey] = onRetry; + } + + public static Action? 
GetOnRetryHandler(this Context context) + { + return context.TryGetValue(RetryHandler.OnRetryHandlerKey, out var handler) ? (Action)handler : default; + } +} diff --git a/src/Tes.Runner/Events/EventsPublisher.cs b/src/Tes.Runner/Events/EventsPublisher.cs index 9bb9a8f45..d6fa30190 100644 --- a/src/Tes.Runner/Events/EventsPublisher.cs +++ b/src/Tes.Runner/Events/EventsPublisher.cs @@ -11,8 +11,8 @@ namespace Tes.Runner.Events; public class EventsPublisher : IAsyncDisposable { - const string EventVersion = "1.0"; - const string EventDataVersion = "1.0"; + public const string EventVersion = "1.0"; + public const string EventDataVersion = "1.0"; public const string TesTaskRunnerEntityType = "TesRunnerTask"; public const string DownloadStartEvent = "downloadStart"; public const string DownloadEndEvent = "downloadEnd"; diff --git a/src/TesApi.Tests/Runner/TaskExecutionScriptingManagerTests.cs b/src/TesApi.Tests/Runner/TaskExecutionScriptingManagerTests.cs index 9e650f9a3..44805b299 100644 --- a/src/TesApi.Tests/Runner/TaskExecutionScriptingManagerTests.cs +++ b/src/TesApi.Tests/Runner/TaskExecutionScriptingManagerTests.cs @@ -46,8 +46,8 @@ public void SetUp() It.IsAny())) .ReturnsAsync(AssetUrl); storageAccessProviderMock.Setup(x => - x.GetInternalTesBlobUrlAsync(It.IsAny(), - It.IsAny())) + x.GetInternalTesBlobUrlAsync(It.IsAny(), It.IsAny(), + It.IsAny(), It.IsAny(), It.IsAny())) .ReturnsAsync(AssetUrl); taskToNodeTaskConverterMock = new Mock(); diff --git a/src/TesApi.Tests/TerraStorageAccessProviderTests.cs b/src/TesApi.Tests/TerraStorageAccessProviderTests.cs index f723f1ea0..ab0971ab8 100644 --- a/src/TesApi.Tests/TerraStorageAccessProviderTests.cs +++ b/src/TesApi.Tests/TerraStorageAccessProviderTests.cs @@ -29,7 +29,7 @@ public class TerraStorageAccessProviderTests private Mock wsmApiClientMock; private Mock azureProxyMock; - private TerraStorageAccessProvider terraStorageAccessProvider; + private IStorageAccessProvider terraStorageAccessProvider; private 
TerraApiStubData terraApiStubData; private Mock> optionsMock; private TerraOptions terraOptions; @@ -129,7 +129,7 @@ public async Task GetMappedSasUrlFromWsmAsync_WithOrWithOutBlobName_ReturnsValid SetUpTerraApiClient(); var blobInfo = new TerraBlobInfo(terraApiStubData.GetWorkspaceIdFromContainerName(WorkspaceStorageContainerName), terraApiStubData.ContainerResourceId, TerraApiStubData.WorkspaceStorageContainerName, "blobName"); - var url = await terraStorageAccessProvider.GetMappedSasUrlFromWsmAsync(blobInfo, CancellationToken.None); + var url = await ((TerraStorageAccessProvider)terraStorageAccessProvider).GetMappedSasUrlFromWsmAsync(blobInfo, false, CancellationToken.None); Assert.IsNotNull(url); var uri = new Uri(url); diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index f2de060cb..bd30d9ffe 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -121,6 +121,8 @@ public AzureProxy(IOptions batchAccountOptions, IBatchPoolM //} } + internal AzureProxy() { } // TODO: Remove. Temporary WIP + // TODO: Static method because the instrumentation key is needed in both Program.cs and Startup.cs and we wanted to avoid intializing the batch client twice. // Can we skip initializing app insights with a instrumentation key in Program.cs? If yes, change this to an instance method. 
/// @@ -333,14 +335,17 @@ public Task GetBatchJobAsync(string jobId, CancellationToken cancellat return (pool.AllocationState, pool.AutoScaleEnabled, pool.TargetLowPriorityComputeNodes, pool.CurrentLowPriorityComputeNodes, pool.TargetDedicatedComputeNodes, pool.CurrentDedicatedComputeNodes); } - private static async Task> GetAccessibleStorageAccountsAsync(CancellationToken cancellationToken) + private static async IAsyncEnumerable GetAccessibleStorageAccountsAsync([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) { var azureClient = await GetAzureManagementClientAsync(cancellationToken); - return await (await azureClient.Subscriptions.ListAsync(cancellationToken: cancellationToken)).ToAsyncEnumerable() + + await foreach (var storageAccountInfo in (await azureClient.Subscriptions.ListAsync(cancellationToken: cancellationToken)).ToAsyncEnumerable() .Select(s => s.SubscriptionId).SelectManyAwait(async (subscriptionId, ct) => (await azureClient.WithSubscription(subscriptionId).StorageAccounts.ListAsync(cancellationToken: cancellationToken)).ToAsyncEnumerable() - .Select(a => new StorageAccountInfo { Id = a.Id, Name = a.Name, SubscriptionId = subscriptionId, BlobEndpoint = a.EndPoints.Primary.Blob })) - .ToListAsync(cancellationToken); + .Select(a => new StorageAccountInfo { Id = a.Id, Name = a.Name, SubscriptionId = subscriptionId, BlobEndpoint = a.EndPoints.Primary.Blob }))) + { + yield return storageAccountInfo; + } } /// @@ -411,28 +416,29 @@ public async Task> ListBlobsAsync(Uri directoryUri, Cance } /// - public IAsyncEnumerable ListBlobsWithTagsAsync(Uri directoryUri, IDictionary tagsQuery, CancellationToken cancellationToken) + public IAsyncEnumerable ListBlobsWithTagsAsync(Uri containerUri, string prefix, CancellationToken cancellationToken) + { + BlobContainerClient container = new(containerUri, new(BlobClientOptions.ServiceVersion.V2021_04_10)); + + return 
container.GetBlobsAsync(Azure.Storage.Blobs.Models.BlobTraits.Tags, Azure.Storage.Blobs.Models.BlobStates.None, prefix, cancellationToken); + } + + /// + public IAsyncEnumerable ListBlobsWithTagsAsync(Uri containerUri, string prefix, IDictionary tagsQuery, CancellationToken cancellationToken) { - BlobUriBuilder builder = new(directoryUri); - var directory = builder.BlobName; - builder.BlobName = string.Empty; - BlobContainerClient container = new(builder.ToUri()); var fullTagsQuery = Enumerable.Empty() - .Append(new($"&where=@container='{container.Name}'")) + //.Append(new($"&where=@container='{container.Name}'")) .Concat(tagsQuery.Select(pair => $"\"{pair.Key}\"='{pair.Value}'")); - if (!directory.EndsWith('/')) - { - directory += "/"; - } + BlobContainerClient container = new(containerUri, new(BlobClientOptions.ServiceVersion.V2021_04_10)); - return container.FindBlobsByTagsAsync(string.Join(" AND", fullTagsQuery), cancellationToken).Where(blob => blob.BlobName.StartsWith(directory)); + return container.FindBlobsByTagsAsync(string.Join(" AND", fullTagsQuery), cancellationToken).Where(blob => blob.BlobName.StartsWith(prefix)); } /// public async Task SetBlobTags(Uri blobAbsoluteUri, IDictionary tags, CancellationToken cancellationToken) { - BlobClient blob = new(blobAbsoluteUri); + BlobClient blob = new(blobAbsoluteUri, new(BlobClientOptions.ServiceVersion.V2021_04_10)); using var result = await blob.SetTagsAsync(tags, cancellationToken: cancellationToken); if (result.IsError) @@ -532,8 +538,8 @@ public async Task CreateBatchPoolAsync(BatchModels.Pool poolInfo, boo /// public async Task GetStorageAccountInfoAsync(string storageAccountName, CancellationToken cancellationToken) - => (await GetAccessibleStorageAccountsAsync(cancellationToken)) - .FirstOrDefault(storageAccount => storageAccount.Name.Equals(storageAccountName, StringComparison.OrdinalIgnoreCase)); + => await GetAccessibleStorageAccountsAsync(cancellationToken) + .FirstOrDefaultAsync(storageAccount => 
storageAccount.Name.Equals(storageAccountName, StringComparison.OrdinalIgnoreCase), cancellationToken); /// public IAsyncEnumerable ListComputeNodesAsync(string poolId, DetailLevel detailLevel = null) diff --git a/src/TesApi.Web/BatchPoolFactory.cs b/src/TesApi.Web/BatchPoolFactory.cs index ffc25572d..5b5f5a5c8 100644 --- a/src/TesApi.Web/BatchPoolFactory.cs +++ b/src/TesApi.Web/BatchPoolFactory.cs @@ -6,6 +6,7 @@ namespace TesApi.Web { + // TODO: Consider moving this class's implementation to Startup /// /// Factory to create BatchPool instances. /// diff --git a/src/TesApi.Web/BatchPoolService.cs b/src/TesApi.Web/BatchPoolService.cs index fb067e3ec..979ac2a4a 100644 --- a/src/TesApi.Web/BatchPoolService.cs +++ b/src/TesApi.Web/BatchPoolService.cs @@ -114,10 +114,10 @@ await OrchestrateTesTasksOnBatchAsync( async ValueTask ProcessFailures(IAsyncEnumerable<(string taskId, AzureBatchTaskState)> failures, CancellationToken cancellationToken) { - await foreach (var (id, state) in failures.WithCancellation(cancellationToken)) + await foreach (var (cloudTaskId, state) in failures.WithCancellation(cancellationToken)) { TesTask tesTask = default; - if (await repository.TryGetItemAsync(batchScheduler.GetTesTaskIdFromCloudTaskId(id), cancellationToken, task => tesTask = task) && tesTask is not null) + if (await repository.TryGetItemAsync(batchScheduler.GetTesTaskIdFromCloudTaskId(cloudTaskId), cancellationToken, task => tesTask = task) && tesTask is not null) { list.Add((tesTask, state)); } @@ -158,17 +158,17 @@ await OrchestrateTesTasksOnBatchAsync( async IAsyncEnumerable GetTesTasks([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) { - foreach (var id in tasks.Select(t => batchScheduler.GetTesTaskIdFromCloudTaskId(t.Id))) + foreach (var tesTaskId in tasks.Select(t => batchScheduler.GetTesTaskIdFromCloudTaskId(t.Id))) { TesTask tesTask = default; - if (await repository.TryGetItemAsync(id, cancellationToken, task => tesTask = 
task) && tesTask is not null) + if (await repository.TryGetItemAsync(tesTaskId, cancellationToken, task => tesTask = task) && tesTask is not null) { logger.LogDebug("Completing task {TesTask}.", tesTask.Id); yield return tesTask; } else { - logger.LogDebug("Could not find task {TesTask}.", id); + logger.LogDebug("Could not find task {TesTask}.", tesTaskId); yield return null; } } @@ -196,7 +196,7 @@ AzureBatchTaskState GetCompletedBatchState(CloudTask task) BatchTaskEndTime: task.ExecutionInformation.EndTime, BatchTaskExitCode: task.ExecutionInformation.ExitCode), - _ => throw new InvalidOperationException(), + _ => throw new System.Diagnostics.UnreachableException(), }; } } diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index e25e2f933..ba0fbfda4 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -182,7 +182,7 @@ public BatchScheduler( async Task SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { - var metrics = newTaskState == TesState.COMPLETEEnum + var (batchNodeMetrics, taskStartTime, taskEndTime, cromwellRcCode) = newTaskState == TesState.COMPLETEEnum ? await GetBatchNodeMetricsAndCromwellResultCodeAsync(tesTask, cancellationToken) : default; @@ -191,11 +191,11 @@ async Task SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, Comb var tesTaskLog = tesTask.GetOrAddTesTaskLog(); var tesTaskExecutorLog = tesTaskLog.GetOrAddExecutorLog(); - tesTaskLog.BatchNodeMetrics = metrics.BatchNodeMetrics; - tesTaskLog.CromwellResultCode = metrics.CromwellRcCode; + tesTaskLog.BatchNodeMetrics = batchNodeMetrics; + tesTaskLog.CromwellResultCode = cromwellRcCode; tesTaskLog.EndTime = DateTime.UtcNow; - tesTaskExecutorLog.StartTime = metrics.TaskStartTime ?? batchInfo.BatchTaskStartTime; - tesTaskExecutorLog.EndTime = metrics.TaskEndTime ?? batchInfo.BatchTaskEndTime; + tesTaskExecutorLog.StartTime = taskStartTime ?? 
batchInfo.BatchTaskStartTime; + tesTaskExecutorLog.EndTime = taskEndTime ?? batchInfo.BatchTaskEndTime; tesTaskExecutorLog.ExitCode = batchInfo.BatchTaskExitCode; // Only accurate when the task completes successfully, otherwise it's the Batch time as reported from Batch @@ -956,7 +956,7 @@ var s when s.StartsWith("batch.node.centos ") => "sudo yum install epel-release var dockerInDockerIsPublic = true; var executorImageIsPublic = containerRegistryProvider.IsImagePublic(executorImage); - var cromwellDrsIsPublic = drsImageNeeded ? containerRegistryProvider.IsImagePublic(cromwellDrsLocalizerImageName) : true; + var cromwellDrsIsPublic = !drsImageNeeded || containerRegistryProvider.IsImagePublic(cromwellDrsLocalizerImageName); BatchModels.ContainerConfiguration result = default; @@ -974,54 +974,38 @@ var s when s.StartsWith("batch.node.centos ") => "sudo yum install epel-release if (!executorImageIsPublic) { - var containerRegistryInfo = await containerRegistryProvider.GetContainerRegistryInfoAsync(executorImage, cancellationToken); - if (containerRegistryInfo is not null) - { - result.ContainerRegistries.Add(new( - userName: containerRegistryInfo.Username, - registryServer: containerRegistryInfo.RegistryServer, - password: containerRegistryInfo.Password)); - } + _ = await AddRegistryIfNeeded(executorImage); } if (!cromwellDrsIsPublic) { - var containerRegistryInfo = await containerRegistryProvider.GetContainerRegistryInfoAsync(cromwellDrsLocalizerImageName, cancellationToken); - if (containerRegistryInfo is not null && !result.ContainerRegistries.Any(registry => registry.RegistryServer == containerRegistryInfo.RegistryServer)) - { - result.ContainerRegistries.Add(new( - userName: containerRegistryInfo.Username, - registryServer: containerRegistryInfo.RegistryServer, - password: containerRegistryInfo.Password)); - } + _ = await AddRegistryIfNeeded(cromwellDrsLocalizerImageName); } if (result.ContainerRegistries.Count != 0) { - var containerRegistryInfo = await 
containerRegistryProvider.GetContainerRegistryInfoAsync(dockerInDockerImageName, cancellationToken); - dockerInDockerIsPublic = containerRegistryInfo is null; - if (containerRegistryInfo is not null && !result.ContainerRegistries.Any(registry => registry.RegistryServer == containerRegistryInfo.RegistryServer)) - { - result.ContainerRegistries.Add(new( - userName: containerRegistryInfo.Username, - registryServer: containerRegistryInfo.RegistryServer, - password: containerRegistryInfo.Password)); - } + dockerInDockerIsPublic = await AddRegistryIfNeeded(dockerInDockerImageName); } } - return result is null || result.ContainerRegistries.Count == 0 ? (default, (true, true, true)) : (new() - { - ContainerImageNames = result.ContainerImageNames, - ContainerRegistries = result - .ContainerRegistries - .Select(r => new BatchModels.ContainerRegistry( - userName: r.UserName, - password: r.Password, - registryServer: r.RegistryServer, - identityReference: r.IdentityReference is null ? null : new() { ResourceId = r.IdentityReference.ResourceId })) - .ToList() - }, (executorImageIsPublic, dockerInDockerIsPublic, cromwellDrsIsPublic)); + return result is null || result.ContainerRegistries.Count == 0 ? 
(default, (true, true, true)) : (result, (executorImageIsPublic, dockerInDockerIsPublic, cromwellDrsIsPublic)); + + async ValueTask AddRegistryIfNeeded(string imageName) + { + var containerRegistryInfo = await containerRegistryProvider.GetContainerRegistryInfoAsync(imageName, cancellationToken); + + if (containerRegistryInfo is not null && !result.ContainerRegistries.Any(registry => registry.RegistryServer == containerRegistryInfo.RegistryServer)) + { + result.ContainerRegistries.Add(new( + userName: containerRegistryInfo.Username, + registryServer: containerRegistryInfo.RegistryServer, + password: containerRegistryInfo.Password)); + + return true; + } + + return false; + } } /// @@ -1342,34 +1326,48 @@ private static Dictionary DelimitedTextToDictionary(string text, .Select(line => { var parts = line.Split(fieldDelimiter); return new KeyValuePair(parts[0], parts[1]); }) .ToDictionary(kv => kv.Key, kv => kv.Value); + /// - public async IAsyncEnumerable GetEventMessages([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken, string @event = null) + public async IAsyncEnumerable GetEventMessagesAsync([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken, string @event) { - var path = "events"; - var tags = new Dictionary(); + const string eventsFolderName = "events"; + + var prefix = eventsFolderName + "/"; + //var tags = new Dictionary(); if (!string.IsNullOrWhiteSpace(@event)) { - path += "/" + @event; - tags.Add("event-name", @event); + prefix += @event + "/"; + //tags.Add("event-name", @event); } - Uri directoryUri = new(await storageAccessProvider.GetInternalTesBlobUrlAsync(path, cancellationToken)); - var accountSegments = StorageAccountUrlSegments.Create(directoryUri.ToString()); + var tesInternalSegments = StorageAccountUrlSegments.Create(storageAccessProvider.GetInternalTesBlobUrlWithoutSasToken(string.Empty)); + var eventsStartIndex = 
(string.IsNullOrEmpty(tesInternalSegments.BlobName) ? string.Empty : (tesInternalSegments.BlobName + "/")).Length; + var eventsEndIndex = eventsStartIndex + eventsFolderName.Length + 1; + + //await foreach (var blobItem in azureProxy.ListBlobsWithTagsAsync(new(await storageAccessProvider.GetInternalTesBlobUrlAsync(string.Empty, cancellationToken, needsFind: true)), prefix, tags, cancellationToken).WithCancellation(cancellationToken)) + //{ + // if (blobItem.Tags.ContainsKey(TesEventMessage.ProcessedTag) || !blobItem.Tags.ContainsKey("task-id")) + // { + // continue; + // } + + + //} - await foreach (var blobItem in azureProxy.ListBlobsWithTagsAsync(directoryUri, tags, cancellationToken).WithCancellation(cancellationToken)) + await foreach (var blobItem in azureProxy.ListBlobsWithTagsAsync(new(await storageAccessProvider.GetInternalTesBlobUrlAsync(string.Empty, cancellationToken, needsTags: true)), prefix, cancellationToken).WithCancellation(cancellationToken)) { - if (blobItem.Tags.ContainsKey(TesEventMessage.ProcessedTag)) + if (blobItem.Tags.ContainsKey(TesEventMessage.ProcessedTag) || !blobItem.Tags.ContainsKey("task-id")) { continue; } - UriBuilder builder = new(directoryUri) { Path = $"{accountSegments.ContainerName}/{blobItem.BlobName}" }; + var blobUrl = await storageAccessProvider.GetInternalTesBlobUrlAsync(blobItem.Name[eventsStartIndex..], cancellationToken, needsTags: true, needsWrite: true); - var pathUnderDirectory = builder.Path[(builder.Path.LastIndexOf("/events/") + 8)..]; - var eventName = pathUnderDirectory[..pathUnderDirectory.IndexOf('/')]; + var pathFromEventName = blobItem.Name[eventsEndIndex..]; + var eventName = pathFromEventName[..pathFromEventName.IndexOf('/')]; - yield return batchTesEventMessageFactory.CreateNew(builder.Uri, blobItem.Tags, eventName); + yield return batchTesEventMessageFactory.CreateNew(new(blobUrl), blobItem.Tags, eventName); } } diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs 
b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index c8363ed1d..8dbfe7390 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -211,8 +211,12 @@ public Task EnableBatchPoolAutoScaleAsync(string poolId, bool preemptable, TimeS public Task EvaluateAutoScaleAsync(string poolId, string autoscaleFormula, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.EvaluateAutoScaleAsync(poolId, autoscaleFormula, ct), cancellationToken); /// - public IAsyncEnumerable ListBlobsWithTagsAsync(Uri directoryUri, IDictionary tagsQuery, CancellationToken cancellationToken) - => cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListBlobsWithTagsAsync(directoryUri, tagsQuery, cancellationToken), cachingRetryHandler.RetryPolicy); + public IAsyncEnumerable ListBlobsWithTagsAsync(Uri containerUri, string prefix, CancellationToken cancellationToken) + => cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListBlobsWithTagsAsync(containerUri, prefix, cancellationToken), cachingRetryHandler.RetryPolicy); + + /// + public IAsyncEnumerable ListBlobsWithTagsAsync(Uri containerUri, string prefix, IDictionary tagsQuery, CancellationToken cancellationToken) + => cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListBlobsWithTagsAsync(containerUri, prefix, tagsQuery, cancellationToken), cachingRetryHandler.RetryPolicy); /// public Task SetBlobTags(Uri blobAbsoluteUri, IDictionary tags, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.SetBlobTags(blobAbsoluteUri, tags, ct), cancellationToken); diff --git a/src/TesApi.Web/Extensions/SystemThreadingTaskExtensions.cs b/src/TesApi.Web/Extensions/SystemThreadingTaskExtensions.cs index 40148e9a0..c10aee2b3 100644 --- a/src/TesApi.Web/Extensions/SystemThreadingTaskExtensions.cs +++ b/src/TesApi.Web/Extensions/SystemThreadingTaskExtensions.cs @@ -11,7 
+11,7 @@ namespace TesApi.Web.Extensions { /// - /// Extension methods for and associated types + /// Extension methods for and associated types /// public static class SystemThreadingTaskExtensions { @@ -26,8 +26,7 @@ public static class SystemThreadingTaskExtensions /// /// /// - /// A task is sent to the return enumeration when it is "complete", which is when it either completes successfully, fails (queues an exception), or is cancelled.
- /// No items in should share an identical instance. + /// A task is sent to the return enumeration when it is "complete", which is when it either completes successfully, fails (queues an exception), or is cancelled. ///
public static async IAsyncEnumerable WhenEach(this IEnumerable source, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken, Func sourceToTask = default) { @@ -46,7 +45,7 @@ public static async IAsyncEnumerable WhenEach(this IEnumerable source, } // There should be no new ArgumentExceptions after this point. - var channel = Channel.CreateBounded(pendingCount); + var channel = Channel.CreateBounded(new BoundedChannelOptions(pendingCount) { SingleWriter = false, SingleReader = true, AllowSynchronousContinuations = false, FullMode = BoundedChannelFullMode.Wait }); // Add continuations to every task. Those continuations will feed the foreach below _ = Parallel.ForEach(list, tuple => diff --git a/src/TesApi.Web/IAzureProxy.cs b/src/TesApi.Web/IAzureProxy.cs index d47faa11c..fbf61fb4a 100644 --- a/src/TesApi.Web/IAzureProxy.cs +++ b/src/TesApi.Web/IAzureProxy.cs @@ -151,11 +151,21 @@ public interface IAzureProxy /// /// List blobs whose tags match a given search expression in the given directory. /// - /// Directory Uri + /// Container URI + /// + /// A for controlling the lifetime of the asynchronous operation. + /// + IAsyncEnumerable ListBlobsWithTagsAsync(Uri containerUri, string prefix, CancellationToken cancellationToken); + + /// + /// List blobs whose tags match a given search expression in the given directory. + /// + /// Container URI + /// /// Tags and values to exactly match (case sensitive). /// A for controlling the lifetime of the asynchronous operation. /// - IAsyncEnumerable ListBlobsWithTagsAsync(Uri directoryUri, IDictionary tagsQuery, CancellationToken cancellationToken); + IAsyncEnumerable ListBlobsWithTagsAsync(Uri containerUri, string prefix, IDictionary tagsQuery, CancellationToken cancellationToken); /// /// Sets tags on the underlying blob. 
diff --git a/src/TesApi.Web/IBatchScheduler.cs b/src/TesApi.Web/IBatchScheduler.cs index d27e5867e..92a50944b 100644 --- a/src/TesApi.Web/IBatchScheduler.cs +++ b/src/TesApi.Web/IBatchScheduler.cs @@ -106,8 +106,8 @@ public interface IBatchScheduler /// Gets unprocessed events from the storage account. /// /// A for controlling the lifetime of the asynchronous operation. - /// Optional event to retrieve. + /// Optional event to retrieve. Defaults to all events. /// - IAsyncEnumerable GetEventMessages(CancellationToken cancellationToken, string @event = default); + IAsyncEnumerable GetEventMessagesAsync(CancellationToken cancellationToken, string @event = default); } } diff --git a/src/TesApi.Web/PagedInterfaceExtensions.cs b/src/TesApi.Web/PagedInterfaceExtensions.cs index 0c7efd779..3acf1b865 100644 --- a/src/TesApi.Web/PagedInterfaceExtensions.cs +++ b/src/TesApi.Web/PagedInterfaceExtensions.cs @@ -59,12 +59,26 @@ public static IAsyncEnumerable ToAsyncEnumerable(this IPage source, Fun /// Policy retrying call to . /// public static IAsyncEnumerable ExecuteAsync(this AsyncRetryPolicy asyncRetryPolicy, Func> func, RetryPolicy retryPolicy) + { + return asyncRetryPolicy.ExecuteAsync(_ => func(), retryPolicy, new()); + } + + /// + /// Adapts calls returning to . + /// + /// Type of results returned in by . + /// Policy retrying calls made while enumerating results returned by . + /// Method returning . + /// Policy retrying call to . + /// An optional . 
+ /// + public static IAsyncEnumerable ExecuteAsync(this AsyncRetryPolicy asyncRetryPolicy, Func> func, RetryPolicy retryPolicy, Polly.Context ctx) { ArgumentNullException.ThrowIfNull(asyncRetryPolicy); ArgumentNullException.ThrowIfNull(func); ArgumentNullException.ThrowIfNull(retryPolicy); - return new PollyAsyncEnumerable((retryPolicy).Execute(() => func()), asyncRetryPolicy); + return new PollyAsyncEnumerable(retryPolicy.Execute(ctx => func(ctx), ctx ??= new()), asyncRetryPolicy, ctx); } #region Implementation classes @@ -101,33 +115,39 @@ private sealed class PollyAsyncEnumerable : IAsyncEnumerable { private readonly IAsyncEnumerable _source; private readonly AsyncRetryPolicy _retryPolicy; + private readonly Polly.Context _ctx; - public PollyAsyncEnumerable(IAsyncEnumerable source, AsyncRetryPolicy retryPolicy) + public PollyAsyncEnumerable(IAsyncEnumerable source, AsyncRetryPolicy retryPolicy, Polly.Context ctx) { ArgumentNullException.ThrowIfNull(source); ArgumentNullException.ThrowIfNull(retryPolicy); + ArgumentNullException.ThrowIfNull(ctx); _source = source; _retryPolicy = retryPolicy; + _ctx = ctx; } IAsyncEnumerator IAsyncEnumerable.GetAsyncEnumerator(CancellationToken cancellationToken) - => new PollyAsyncEnumerator(_source.GetAsyncEnumerator(cancellationToken), _retryPolicy, cancellationToken); + => new PollyAsyncEnumerator(_source.GetAsyncEnumerator(cancellationToken), _retryPolicy, _ctx, cancellationToken); } private sealed class PollyAsyncEnumerator : IAsyncEnumerator { private readonly IAsyncEnumerator _source; private readonly AsyncRetryPolicy _retryPolicy; + private readonly Polly.Context _ctx; private readonly CancellationToken _cancellationToken; - public PollyAsyncEnumerator(IAsyncEnumerator source, AsyncRetryPolicy retryPolicy, CancellationToken cancellationToken) + public PollyAsyncEnumerator(IAsyncEnumerator source, AsyncRetryPolicy retryPolicy, Polly.Context ctx, CancellationToken cancellationToken) { 
ArgumentNullException.ThrowIfNull(source); ArgumentNullException.ThrowIfNull(retryPolicy); + ArgumentNullException.ThrowIfNull(ctx); _source = source; _retryPolicy = retryPolicy; + _ctx = ctx; _cancellationToken = cancellationToken; } @@ -138,7 +158,7 @@ ValueTask IAsyncDisposable.DisposeAsync() => _source.DisposeAsync(); ValueTask IAsyncEnumerator.MoveNextAsync() - => new(_retryPolicy.ExecuteAsync(ct => _source.MoveNextAsync(ct).AsTask(), _cancellationToken)); + => new(_retryPolicy.ExecuteAsync((ctx, ct) => _source.MoveNextAsync(ct).AsTask(), _ctx, _cancellationToken)); } private sealed class PageEnumerator : EnumeratorEnumerator> diff --git a/src/TesApi.Web/Scheduler.cs b/src/TesApi.Web/Scheduler.cs index cc273e9d2..07363c037 100644 --- a/src/TesApi.Web/Scheduler.cs +++ b/src/TesApi.Web/Scheduler.cs @@ -156,11 +156,11 @@ private Task ExecuteUpdateTesTaskFromEventBlobAsync(CancellationToken stoppingTo /// async ValueTask UpdateTesTasksFromEventBlobsAsync(CancellationToken stoppingToken) { - var messageInfos = new List(); - var messages = new ConcurrentBag(); + var messageInfos = new ConcurrentBag(); + var messages = new ConcurrentBag<(string Id, AzureBatchTaskState State)>(); // Get and parse event blobs - await foreach (var message in batchScheduler.GetEventMessages(stoppingToken, "taskCompleted").WithCancellation(stoppingToken)) + await foreach (var message in batchScheduler.GetEventMessagesAsync(stoppingToken, Tes.Runner.Events.EventsPublisher.TaskCompletionEvent).WithCancellation(stoppingToken)) { messageInfos.Add(message); } @@ -177,17 +177,17 @@ await OrchestrateTesTasksOnBatchAsync( #pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously async token => GetTesTasks(token), #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously - (tesTasks, token) => batchScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, messages.Select(GetCompletedBatchState).ToArray(), token), + (tesTasks, 
token) => batchScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, messages.Select(t => t.State).ToArray(), token), stoppingToken); // Helpers async ValueTask ProcessMessage(TesEventMessage messageInfo, CancellationToken cancellationToken) { - // TODO: remove the switch (keeping the message retrieval) when GetCompletedBatchState can process all events + // TODO: remove the switch (keeping the message state retrieval) when GetCompletedBatchState can process all events switch (messageInfo.Event) { - case "taskCompleted": - messages.Add(await messageInfo.GetMessageAsync(cancellationToken)); + case Tes.Runner.Events.EventsPublisher.TaskCompletionEvent: + messages.Add(await messageInfo.GetMessageBatchStateAsync(cancellationToken)); break; default: @@ -199,7 +199,7 @@ async ValueTask ProcessMessage(TesEventMessage messageInfo, CancellationToken ca async IAsyncEnumerable GetTesTasks([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) { - foreach (var id in messages.Select(t => batchScheduler.GetTesTaskIdFromCloudTaskId(t.EntityId))) + foreach (var id in messages.Select(t => batchScheduler.GetTesTaskIdFromCloudTaskId(t.Id))) { TesTask tesTask = default; if (await repository.TryGetItemAsync(id, cancellationToken, task => tesTask = task) && tesTask is not null) @@ -214,33 +214,6 @@ async IAsyncEnumerable GetTesTasks([System.Runtime.CompilerServices.Enu } } } - - AzureBatchTaskState GetCompletedBatchState(Tes.Runner.Events.EventMessage task) - { - logger.LogDebug("Getting batch task state from event {EventName} for {TesTask}.", task.Name, task.EntityId); - return task.Name switch - { - "taskCompleted" => string.IsNullOrWhiteSpace(task.EventData["errorMessage"]) - - ? 
new( - AzureBatchTaskState.TaskState.CompletedSuccessfully, - BatchTaskStartTime: task.Created - TimeSpan.Parse(task.EventData["duration"]), - BatchTaskEndTime: task.Created/*, - BatchTaskExitCode: 0*/) - - : new( - AzureBatchTaskState.TaskState.CompletedWithErrors, - Failure: new("ExecutorError", - Enumerable.Empty() - .Append(task.EventData["errorMessage"])), - BatchTaskStartTime: task.Created - TimeSpan.Parse(task.EventData["duration"]), - BatchTaskEndTime: task.Created/*, - BatchTaskExitCode: 0*/), - - // TODO: the rest - _ => throw new System.Diagnostics.UnreachableException(), - }; - } } } } diff --git a/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs b/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs index ab2db3ae4..fb2d268e0 100644 --- a/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs +++ b/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs @@ -7,12 +7,11 @@ using System.Threading; using System.Threading.Tasks; using System.Web; +using Azure.Storage; using Azure.Storage.Blobs; +using Azure.Storage.Sas; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; -using Microsoft.WindowsAzure.Storage; -using Microsoft.WindowsAzure.Storage.Auth; -using Microsoft.WindowsAzure.Storage.Blob; using Tes.Extensions; using Tes.Models; using TesApi.Web.Options; @@ -90,7 +89,12 @@ public override async Task IsPublicHttpUrlAsync(string uriString, Cancella } /// - public override async Task MapLocalPathToSasUrlAsync(string path, CancellationToken cancellationToken, TimeSpan? sasTokenDuration = default, bool getContainerSas = false) + public override Task MapLocalPathToSasUrlAsync(string path, CancellationToken cancellationToken, TimeSpan? sasTokenDuration, bool getContainerSas) + { + return MapLocalPathToSasUrlImplAsync(path, sasTokenDuration, getContainerSas, cancellationToken); + } + + private async Task MapLocalPathToSasUrlImplAsync(string path, TimeSpan? 
sasTokenDuration, bool getContainerSas, CancellationToken cancellationToken) { // TODO: Optional: If path is /container/... where container matches the name of the container in the default storage account, prepend the account name to the path. // This would allow the user to omit the account name for files stored in the default storage account @@ -113,55 +117,76 @@ public override async Task MapLocalPathToSasUrlAsync(string path, Cancel } else { - StorageAccountInfo storageAccountInfo = null; - - if (!await TryGetStorageAccountInfoAsync(pathSegments.AccountName, cancellationToken, info => storageAccountInfo = info)) + try + { + var result = await AddSasTokenAsync(pathSegments, sasTokenDuration, getContainerSas, false, false, true, cancellationToken, path); + return result.ToUriString(); + } + catch { - Logger.LogError($"Could not find storage account '{pathSegments.AccountName}' corresponding to path '{path}'. Either the account does not exist or the TES app service does not have permission to it."); return null; } + } + } - try - { - var accountKey = await AzureProxy.GetStorageAccountKeyAsync(storageAccountInfo, cancellationToken); - var resultPathSegments = new StorageAccountUrlSegments(storageAccountInfo.BlobEndpoint, pathSegments.ContainerName, pathSegments.BlobName); + private async Task AddSasTokenAsync(StorageAccountUrlSegments pathSegments, TimeSpan? sasTokenDuration, bool getContainerSas, bool? needsTags, bool? needsFind, bool? needsWrite, CancellationToken cancellationToken, string path = default) + { + StorageAccountInfo storageAccountInfo = null; - if (pathSegments.IsContainer || getContainerSas) - { - var policy = new SharedAccessBlobPolicy() - { - Permissions = SharedAccessBlobPermissions.Add | SharedAccessBlobPermissions.Create | SharedAccessBlobPermissions.List | SharedAccessBlobPermissions.Read | SharedAccessBlobPermissions.Write, - SharedAccessExpiryTime = DateTime.Now.Add((sasTokenDuration ?? 
TimeSpan.Zero) + SasTokenDuration) - }; - - var containerUri = new StorageAccountUrlSegments(storageAccountInfo.BlobEndpoint, pathSegments.ContainerName).ToUri(); - resultPathSegments.SasToken = new CloudBlobContainer(containerUri, new StorageCredentials(storageAccountInfo.Name, accountKey)).GetSharedAccessSignature(policy, null, SharedAccessProtocol.HttpsOnly, null); - } - else - { - var policy = new SharedAccessBlobPolicy() { Permissions = SharedAccessBlobPermissions.Read, SharedAccessExpiryTime = DateTime.Now.Add((sasTokenDuration ?? TimeSpan.Zero) + SasTokenDuration) }; - resultPathSegments.SasToken = new CloudBlob(resultPathSegments.ToUri(), new StorageCredentials(storageAccountInfo.Name, accountKey)).GetSharedAccessSignature(policy, null, null, SharedAccessProtocol.HttpsOnly, null); - } + if (!await TryGetStorageAccountInfoAsync(pathSegments.AccountName, cancellationToken, info => storageAccountInfo = info)) + { + Logger.LogError($"Could not find storage account '{pathSegments.AccountName}' corresponding to path '{path}'. 
Either the account does not exist or the TES app service does not have permission to it."); + throw new InvalidOperationException($"Could not find storage account '{pathSegments.AccountName}' corresponding to path '{path}'."); + } - return resultPathSegments.ToUriString(); + try + { + var accountKey = await AzureProxy.GetStorageAccountKeyAsync(storageAccountInfo, cancellationToken); + var resultPathSegments = new StorageAccountUrlSegments(storageAccountInfo.BlobEndpoint, pathSegments.ContainerName, pathSegments.BlobName); + + var sasBlobPermissions = BlobSasPermissions.Read; + var sasContainerPermissions = BlobContainerSasPermissions.Add | BlobContainerSasPermissions.Create | BlobContainerSasPermissions.List | BlobContainerSasPermissions.Read | BlobContainerSasPermissions.Write; + + if (needsTags.GetValueOrDefault()) + { + sasContainerPermissions |= BlobContainerSasPermissions.Tag; + sasBlobPermissions |= BlobSasPermissions.Tag; + } + + if (pathSegments.IsContainer && needsFind.GetValueOrDefault()) + { + sasContainerPermissions |= BlobContainerSasPermissions.Filter; } - catch (Exception ex) + else if (needsWrite.GetValueOrDefault()) { - Logger.LogError(ex, $"Could not get the key of storage account '{pathSegments.AccountName}'. Make sure that the TES app service has Contributor access to it."); - return null; + sasBlobPermissions |= BlobSasPermissions.Add | BlobSasPermissions.Create | BlobSasPermissions.List | BlobSasPermissions.Write; } + + var expiresOn = DateTimeOffset.UtcNow.Add((sasTokenDuration ?? TimeSpan.Zero) + SasTokenDuration); + var builder = pathSegments.IsContainer || getContainerSas ? 
new BlobSasBuilder(sasContainerPermissions, expiresOn) : new BlobSasBuilder(sasBlobPermissions, expiresOn); + + builder.BlobContainerName = resultPathSegments.ContainerName; + builder.BlobName = resultPathSegments.BlobName; + builder.Protocol = SasProtocol.Https; + + resultPathSegments.SasToken = builder.ToSasQueryParameters(new StorageSharedKeyCredential(storageAccountInfo.Name, accountKey)).ToString(); + return resultPathSegments; + } + catch (Exception ex) + { + Logger.LogError(ex, $"Could not get the key of storage account '{pathSegments.AccountName}'. Make sure that the TES app service has Contributor access to it."); + throw; } } /// - public override async Task GetInternalTesBlobUrlAsync(string blobPath, CancellationToken cancellationToken) + public override async Task GetInternalTesBlobUrlAsync(string blobPath, CancellationToken cancellationToken, bool? needsTags, bool? needsFind, bool? needsWrite) { - var normalizedBlobPath = NormalizedBlobPath(blobPath); - - return await MapLocalPathToSasUrlAsync($"/{defaultStorageAccountName}{TesExecutionsPathPrefix}{normalizedBlobPath}", cancellationToken, getContainerSas: true); + var pathSegments = StorageAccountUrlSegments.Create(GetInternalTesBlobUrlWithoutSasToken(blobPath)); + var resultPathSegments = await AddSasTokenAsync(pathSegments, SasTokenDuration, false, needsTags, needsFind, needsWrite, cancellationToken); + return resultPathSegments.ToUriString(); } - private static string NormalizedBlobPath(string blobPath) { return string.IsNullOrEmpty(blobPath) ? 
string.Empty : $"/{blobPath.TrimStart('/')}"; @@ -206,10 +231,11 @@ public override async Task GetInternalTesTaskBlobUrlAsync(TesTask task, { var blobPathWithPathPrefix = $"/{defaultStorageAccountName}/{task.Resources.GetBackendParameterValue(TesResources.SupportedBackendParameters.internal_path_prefix).Trim('/')}{normalizedBlobPath}"; - return await MapLocalPathToSasUrlAsync(blobPathWithPathPrefix, cancellationToken, getContainerSas: true); + + return await This.MapLocalPathToSasUrlAsync(blobPathWithPathPrefix, cancellationToken, getContainerSas: true); } - return await GetInternalTesBlobUrlAsync($"/{task.Id}{normalizedBlobPath}", cancellationToken); + return await This.GetInternalTesBlobUrlAsync($"/{task.Id}{normalizedBlobPath}", cancellationToken); } private async Task TryGetStorageAccountInfoAsync(string accountName, CancellationToken cancellationToken, Action onSuccess = null) @@ -244,6 +270,5 @@ private bool TryGetExternalStorageAccountInfo(string accountName, string contain return result is not null; } - } } diff --git a/src/TesApi.Web/Storage/IStorageAccessProvider.cs b/src/TesApi.Web/Storage/IStorageAccessProvider.cs index f5d3bedc5..600583e44 100644 --- a/src/TesApi.Web/Storage/IStorageAccessProvider.cs +++ b/src/TesApi.Web/Storage/IStorageAccessProvider.cs @@ -84,8 +84,15 @@ public interface IStorageAccessProvider ///
/// /// + /// Optional. Read or write the tags on a blob. + /// Optional. Find blobs with index tags. + /// Optional. Allow changing the blob. /// - public Task GetInternalTesBlobUrlAsync(string blobPath, CancellationToken cancellationToken); + /// + /// If the blobPath is not provided(empty), a container SAS token is generated. + /// If the blobPath is provided, a SAS token to the blobPath prefixed with the TES internal segments is generated. + /// + public Task GetInternalTesBlobUrlAsync(string blobPath, CancellationToken cancellationToken, bool? needsTags = default, bool? needsFind = default, bool? needsWrite = default); /// /// Returns an Azure Storage Blob URL with a SAS token for the specified blob path in the TES task internal storage location. @@ -94,6 +101,10 @@ public interface IStorageAccessProvider /// /// /// + /// + /// If the blobPath is not provided(empty), a container SAS token is generated. + /// If the blobPath is provided, a SAS token to the blobPath prefixed with the TES task internal segments is generated. + /// public Task GetInternalTesTaskBlobUrlAsync(TesTask task, string blobPath, CancellationToken cancellationToken); /// diff --git a/src/TesApi.Web/Storage/StorageAccessProvider.cs b/src/TesApi.Web/Storage/StorageAccessProvider.cs index 8f5a2395e..79420f25b 100644 --- a/src/TesApi.Web/Storage/StorageAccessProvider.cs +++ b/src/TesApi.Web/Storage/StorageAccessProvider.cs @@ -38,16 +38,21 @@ public abstract class StorageAccessProvider : IStorageAccessProvider /// /// Logger /// Azure proxy - public StorageAccessProvider(ILogger logger, IAzureProxy azureProxy) + protected StorageAccessProvider(ILogger logger, IAzureProxy azureProxy) { this.Logger = logger; this.AzureProxy = azureProxy; } + /// + /// Property to assist in directly accessing methods in this instance. 
+ /// + protected IStorageAccessProvider This => this; + /// public async Task DownloadBlobAsync(string blobRelativePath, CancellationToken cancellationToken) { - var url = await MapLocalPathToSasUrlAsync(blobRelativePath, cancellationToken); + var url = await This.MapLocalPathToSasUrlAsync(blobRelativePath, cancellationToken); if (url is null) { @@ -80,7 +85,7 @@ public async Task DownloadBlobAsync(Uri blobAbsoluteUrl, CancellationTok /// public async Task UploadBlobAsync(string blobRelativePath, string content, CancellationToken cancellationToken) - => await this.AzureProxy.UploadBlobAsync(new Uri(await MapLocalPathToSasUrlAsync(blobRelativePath, cancellationToken, getContainerSas: true)), content, cancellationToken); + => await this.AzureProxy.UploadBlobAsync(new Uri(await This.MapLocalPathToSasUrlAsync(blobRelativePath, cancellationToken, getContainerSas: true)), content, cancellationToken); /// public async Task UploadBlobAsync(Uri blobAbsoluteUrl, string content, @@ -93,16 +98,16 @@ public async Task UploadBlobAsync(Uri blobAbsoluteUrl, string content, /// public async Task UploadBlobFromFileAsync(string blobRelativePath, string sourceLocalFilePath, CancellationToken cancellationToken) - => await this.AzureProxy.UploadBlobFromFileAsync(new Uri(await MapLocalPathToSasUrlAsync(blobRelativePath, cancellationToken, getContainerSas: true)), sourceLocalFilePath, cancellationToken); + => await this.AzureProxy.UploadBlobFromFileAsync(new Uri(await This.MapLocalPathToSasUrlAsync(blobRelativePath, cancellationToken, getContainerSas: true)), sourceLocalFilePath, cancellationToken); /// public abstract Task IsPublicHttpUrlAsync(string uriString, CancellationToken cancellationToken); /// - public abstract Task MapLocalPathToSasUrlAsync(string path, CancellationToken cancellationToken, TimeSpan? sasTokenDuration = default, bool getContainerSas = false); + public abstract Task MapLocalPathToSasUrlAsync(string path, CancellationToken cancellationToken, TimeSpan? 
sasTokenDuration, bool getContainerSas); /// - public abstract Task GetInternalTesBlobUrlAsync(string blobPath, CancellationToken cancellationToken); + public abstract Task GetInternalTesBlobUrlAsync(string blobPath, CancellationToken cancellationToken, bool? needsTags, bool? needsFind, bool? needsWrite); /// public abstract Task GetInternalTesTaskBlobUrlAsync(TesTask task, string blobPath, CancellationToken cancellationToken); diff --git a/src/TesApi.Web/Storage/TerraStorageAccessProvider.cs b/src/TesApi.Web/Storage/TerraStorageAccessProvider.cs index 8598e7659..b495ed56e 100644 --- a/src/TesApi.Web/Storage/TerraStorageAccessProvider.cs +++ b/src/TesApi.Web/Storage/TerraStorageAccessProvider.cs @@ -44,7 +44,7 @@ public class TerraStorageAccessProvider : StorageAccessProvider public TerraStorageAccessProvider(TerraWsmApiClient terraWsmApiClient, IAzureProxy azureProxy, IOptions terraOptions, IOptions batchSchedulingOptions, ILogger logger) : base( - logger, azureProxy) + logger, azureProxy) { ArgumentNullException.ThrowIfNull(terraOptions); ArgumentNullException.ThrowIfNull(batchSchedulingOptions); @@ -82,7 +82,7 @@ public override Task IsPublicHttpUrlAsync(string uriString, CancellationTo } /// - public override async Task MapLocalPathToSasUrlAsync(string path, CancellationToken cancellationToken, TimeSpan? sasTokenDuration = default, bool getContainerSas = false) + public override async Task MapLocalPathToSasUrlAsync(string path, CancellationToken cancellationToken, TimeSpan? 
sasTokenDuration, bool getContainerSas) { ArgumentException.ThrowIfNullOrEmpty(path); if (sasTokenDuration is not null) @@ -99,51 +99,38 @@ public override async Task MapLocalPathToSasUrlAsync(string path, Cancel if (getContainerSas) { - return await GetMappedSasContainerUrlFromWsmAsync(terraBlobInfo, cancellationToken); + return await GetMappedSasContainerUrlFromWsmAsync(terraBlobInfo, false, false, cancellationToken); } - return await GetMappedSasUrlFromWsmAsync(terraBlobInfo, cancellationToken); + return await GetMappedSasUrlFromWsmAsync(terraBlobInfo, false, cancellationToken); } - /// - /// Returns a URL with a SAS token for the provided blobPath.The resulting URL contains the TES internal segments as a prefix to the blobPath. - /// If the blobPath is not provided(empty), a container SAS token is generated. - /// If the blobPath is provided, a SAS token to the blobPath prefixed with the TES internal segments is generated. - /// - /// - /// - /// - public override async Task GetInternalTesBlobUrlAsync(string blobPath, CancellationToken cancellationToken) + /// + public override async Task GetInternalTesBlobUrlAsync(string blobPath, CancellationToken cancellationToken, bool? needsTags, bool? needsFind, bool? needsWrite) { + // Currently all SAS tokens with Terra are R/W so needsWrite is waiting for a safer future. + var blobInfo = GetTerraBlobInfoForInternalTes(blobPath); if (string.IsNullOrEmpty(blobPath)) { - return await GetMappedSasContainerUrlFromWsmAsync(blobInfo, cancellationToken); + return await GetMappedSasContainerUrlFromWsmAsync(blobInfo, needsTags, needsFind, cancellationToken); } - return await GetMappedSasUrlFromWsmAsync(blobInfo, cancellationToken); + return await GetMappedSasUrlFromWsmAsync(blobInfo, needsTags, cancellationToken); } - /// - /// Returns a URL with a SAS token for the provided blobPath.The resulting URL contains the TES task internal segments as a prefix to the blobPath. 
- /// If the blobPath is not provided(empty), a container SAS token is generated. - /// If the blobPath is provided, a SAS token to the blobPath prefixed with the TES task internal segments is generated. - /// - /// - /// - /// - /// + /// public override async Task GetInternalTesTaskBlobUrlAsync(TesTask task, string blobPath, CancellationToken cancellationToken) { var blobInfo = GetTerraBlobInfoForInternalTesTask(task, blobPath); if (string.IsNullOrEmpty(blobPath)) { - return await GetMappedSasContainerUrlFromWsmAsync(blobInfo, cancellationToken); + return await GetMappedSasContainerUrlFromWsmAsync(blobInfo, false, false, cancellationToken); } - return await GetMappedSasUrlFromWsmAsync(blobInfo, cancellationToken); + return await GetMappedSasUrlFromWsmAsync(blobInfo, false, cancellationToken); } /// @@ -292,9 +279,9 @@ private Guid ToWorkspaceId(string segmentsContainerName) } } - private async Task GetMappedSasContainerUrlFromWsmAsync(TerraBlobInfo blobInfo, CancellationToken cancellationToken) + private async Task GetMappedSasContainerUrlFromWsmAsync(TerraBlobInfo blobInfo, bool? needsTags, bool? needsFind, CancellationToken cancellationToken) { - var tokenInfo = await GetWorkspaceContainerSasTokenFromWsmAsync(blobInfo, cancellationToken); + var tokenInfo = await GetWorkspaceContainerSasTokenFromWsmAsync(blobInfo, needsTags, needsFind, cancellationToken); var urlBuilder = new UriBuilder(tokenInfo.Url); @@ -310,11 +297,12 @@ private async Task GetMappedSasContainerUrlFromWsmAsync(TerraBlobInfo bl /// Returns a Url with a SAS token for the given input /// /// + /// /// A for controlling the lifetime of the asynchronous operation. /// URL with a SAS token - public async Task GetMappedSasUrlFromWsmAsync(TerraBlobInfo blobInfo, CancellationToken cancellationToken) + internal async Task GetMappedSasUrlFromWsmAsync(TerraBlobInfo blobInfo, bool? 
needsTags, CancellationToken cancellationToken) { - var tokenInfo = await GetWorkspaceBlobSasTokenFromWsmAsync(blobInfo, cancellationToken); + var tokenInfo = await GetWorkspaceBlobSasTokenFromWsmAsync(blobInfo, needsTags, cancellationToken); Logger.LogInformation($"Successfully obtained the Sas Url from Terra. Wsm resource id:{terraOptions.WorkspaceStorageContainerResourceId}"); @@ -338,9 +326,9 @@ private SasTokenApiParameters CreateTokenParamsFromOptions(string blobName, stri sasPermissions, blobName); - private async Task GetWorkspaceBlobSasTokenFromWsmAsync(TerraBlobInfo blobInfo, CancellationToken cancellationToken) + private async Task GetWorkspaceBlobSasTokenFromWsmAsync(TerraBlobInfo blobInfo, bool? needsTags, CancellationToken cancellationToken) { - var tokenParams = CreateTokenParamsFromOptions(blobInfo.BlobName, SasBlobPermissions); + var tokenParams = CreateTokenParamsFromOptions(blobInfo.BlobName, SasBlobPermissions + (needsTags.GetValueOrDefault() ? "t" : string.Empty)); Logger.LogInformation( $"Getting Sas Url from Terra. Wsm workspace id:{blobInfo.WorkspaceId}"); @@ -351,10 +339,10 @@ private async Task GetWorkspaceBlobSasTokenFromWsmAsync( tokenParams, cancellationToken); } - private async Task GetWorkspaceContainerSasTokenFromWsmAsync(TerraBlobInfo blobInfo, CancellationToken cancellationToken) + private async Task GetWorkspaceContainerSasTokenFromWsmAsync(TerraBlobInfo blobInfo, bool? needsTags, bool? needsFind, CancellationToken cancellationToken) { // an empty blob name gets a container Sas token - var tokenParams = CreateTokenParamsFromOptions(blobName: "", SasContainerPermissions); + var tokenParams = CreateTokenParamsFromOptions(blobName: "", SasContainerPermissions + (needsTags.GetValueOrDefault() ? "t" : string.Empty) + (needsFind.GetValueOrDefault() ? "l" : string.Empty)); Logger.LogInformation( $"Getting Sas container Url from Terra. 
Wsm workspace id:{blobInfo.WorkspaceId}"); diff --git a/src/TesApi.Web/TesEventMessage.cs b/src/TesApi.Web/TesEventMessage.cs index dcf7a81b4..d5ffa4a05 100644 --- a/src/TesApi.Web/TesEventMessage.cs +++ b/src/TesApi.Web/TesEventMessage.cs @@ -15,6 +15,7 @@ namespace TesApi.Web /// /// Represents the events sent by the node task runner. /// + /// This should be transient in DI. public class TesEventMessage { static TesEventMessage() => Tes.Utilities.NewtonsoftJsonSafeInit.SetDefaultSettings(); @@ -56,11 +57,23 @@ public TesEventMessage(IAzureProxy azureProxy, ILogger logger, ArgumentNullException.ThrowIfNull(tags); ArgumentNullException.ThrowIfNull(@event); + if (tags.Count == 0) + { + throw new ArgumentException("This message has no tags.", nameof(tags)); + } + if (tags.ContainsKey(ProcessedTag)) { throw new ArgumentException("This message was already processed.", nameof(tags)); } + // There are up to 10 tags allowed. We will be adding one. + // https://learn.microsoft.com/azure/storage/blobs/storage-manage-find-blobs?tabs=azure-portal#setting-blob-index-tags + if (tags.Count > 9) + { + throw new ArgumentException("This message does not have space to add the processed tag.", nameof(tags)); + } + _azureProxy = azureProxy; _logger = logger; _storageAccessProvider = storageAccessProvider; @@ -74,12 +87,26 @@ public TesEventMessage(IAzureProxy azureProxy, ILogger logger, ///
/// /// - public async Task GetMessageAsync(CancellationToken cancellationToken) + public async Task<(string Id, AzureBatchTaskState State)> GetMessageBatchStateAsync(CancellationToken cancellationToken) { var messageText = await _azureProxy.DownloadBlobAsync(_uri, cancellationToken); var result = Newtonsoft.Json.JsonConvert.DeserializeObject(messageText); + // TODO: throw if null - return result; + // Validate. Suggestions include: + //Guid.TryParse(result.Id, out _) + //Tes.Runner.Events.EventsPublisher.EventVersion.Equals(result.EventVersion, StringComparison.Ordinal) + //Tes.Runner.Events.EventsPublisher.EventDataVersion.Equals(result.EventDataVersion, StringComparison.Ordinal) + //Tes.Runner.Events.EventsPublisher.TesTaskRunnerEntityType.Equals(result.EntityType, StringComparison.Ordinal) + //Tes.Runner.Events.EventsPublisher.TesTaskRunnerEntityType.Equals(result.EntityType, StringComparison.Ordinal) + //Event.Equals(result.Name, StringComparison.Ordinal) + //new[] { Tes.Runner.Events.EventsPublisher.StartedStatus, Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(result.StatusMessage) + + // Event type specific validations + // + + _logger.LogDebug("Getting batch task state from event {EventName} for {TesTask}.", result.Name ?? 
Event, result.EntityId); + return (result.EntityId, GetCompletedBatchState(result)); } /// @@ -89,14 +116,45 @@ public TesEventMessage(IAzureProxy azureProxy, ILogger logger, /// public async Task MarkMessageProcessed(CancellationToken cancellationToken) { - var uri = await _storageAccessProvider.MapLocalPathToSasUrlAsync(_uri.ToString(), cancellationToken); - await _azureProxy.SetBlobTags(new Uri(uri), Tags.Append(new KeyValuePair(ProcessedTag, DateTime.UtcNow.ToString("O"))).ToDictionary(pair => pair.Key, pair => pair.Value), cancellationToken); + await _azureProxy.SetBlobTags( + _uri, + Tags.Append(new(ProcessedTag, DateTime.UtcNow.ToString("O"))) + .ToDictionary(pair => pair.Key, pair => pair.Value), + cancellationToken); + } + + private /*static*/ AzureBatchTaskState GetCompletedBatchState(Tes.Runner.Events.EventMessage task) + { + return (task.Name ?? Event) switch + { + Tes.Runner.Events.EventsPublisher.TaskCompletionEvent => string.IsNullOrWhiteSpace(task.EventData["errorMessage"]) + + ? new( + AzureBatchTaskState.TaskState.CompletedSuccessfully, + BatchTaskStartTime: task.Created - TimeSpan.Parse(task.EventData["duration"]), + BatchTaskEndTime: task.Created/*, + BatchTaskExitCode: 0*/) + + : new( + AzureBatchTaskState.TaskState.CompletedWithErrors, + Failure: new("ExecutorError", + Enumerable.Empty() + .Append(task.EventData["errorMessage"])), + BatchTaskStartTime: task.Created - TimeSpan.Parse(task.EventData["duration"]), + BatchTaskEndTime: task.Created/*, + BatchTaskExitCode: 0*/), + + // TODO: the rest + _ => new(AzureBatchTaskState.TaskState.NodePreempted), //throw new System.Diagnostics.UnreachableException(), + }; } } + // TODO: Consider moving this class's implementation to Startup /// /// Factory to create TesEventMessage instances. /// + /// This can be a singleton in DI. 
public sealed class BatchTesEventMessageFactory { private readonly IServiceProvider _serviceProvider; From 3775d245199fc8ae8a785a8c7c1c8d2f5c662fb5 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 13 Oct 2023 11:30:42 -0700 Subject: [PATCH 033/202] Set tasks to Running and some bug fixes --- .../Runner/TaskExecutionScriptingManagerTests.cs | 2 +- src/TesApi.Tests/Runner/TaskToNodeTaskConverterTests.cs | 2 +- .../Storage/DefaultStorageAccessProviderTests.cs | 2 +- src/TesApi.Web/Runner/TaskExecutionScriptingManager.cs | 2 +- src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs | 2 +- src/TesApi.Web/Scheduler.cs | 7 +++++-- src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs | 6 +++--- src/TesApi.Web/Storage/IStorageAccessProvider.cs | 3 ++- src/TesApi.Web/Storage/StorageAccessProvider.cs | 4 ++-- src/TesApi.Web/Storage/TerraStorageAccessProvider.cs | 4 +++- src/TesApi.Web/TesEventMessage.cs | 2 ++ 11 files changed, 22 insertions(+), 14 deletions(-) diff --git a/src/TesApi.Tests/Runner/TaskExecutionScriptingManagerTests.cs b/src/TesApi.Tests/Runner/TaskExecutionScriptingManagerTests.cs index 44805b299..df72c0f3a 100644 --- a/src/TesApi.Tests/Runner/TaskExecutionScriptingManagerTests.cs +++ b/src/TesApi.Tests/Runner/TaskExecutionScriptingManagerTests.cs @@ -43,7 +43,7 @@ public void SetUp() storageAccessProviderMock = new Mock(); storageAccessProviderMock.Setup(x => x.GetInternalTesTaskBlobUrlAsync(It.IsAny(), It.IsAny(), - It.IsAny())) + It.IsAny(), It.IsAny())) .ReturnsAsync(AssetUrl); storageAccessProviderMock.Setup(x => x.GetInternalTesBlobUrlAsync(It.IsAny(), It.IsAny(), diff --git a/src/TesApi.Tests/Runner/TaskToNodeTaskConverterTests.cs b/src/TesApi.Tests/Runner/TaskToNodeTaskConverterTests.cs index 6cf0b8b05..d4fcffab1 100644 --- a/src/TesApi.Tests/Runner/TaskToNodeTaskConverterTests.cs +++ b/src/TesApi.Tests/Runner/TaskToNodeTaskConverterTests.cs @@ -53,7 +53,7 @@ public void SetUp() storageOptions = new StorageOptions() { ExternalStorageContainers = 
ExternalStorageContainerWithSas }; storageAccessProviderMock = new Mock(); storageAccessProviderMock.Setup(x => - x.GetInternalTesTaskBlobUrlAsync(It.IsAny(), It.IsAny(), It.IsAny())) + x.GetInternalTesTaskBlobUrlAsync(It.IsAny(), It.IsAny(), It.IsAny(), It.IsAny())) .ReturnsAsync(InternalBlobUrlWithSas); storageAccessProviderMock.Setup(x => x.GetInternalTesBlobUrlWithoutSasToken(It.IsAny())) diff --git a/src/TesApi.Tests/Storage/DefaultStorageAccessProviderTests.cs b/src/TesApi.Tests/Storage/DefaultStorageAccessProviderTests.cs index e40f5704a..772ef6894 100644 --- a/src/TesApi.Tests/Storage/DefaultStorageAccessProviderTests.cs +++ b/src/TesApi.Tests/Storage/DefaultStorageAccessProviderTests.cs @@ -20,7 +20,7 @@ namespace TesApi.Tests.Storage [TestClass, TestCategory("Unit")] public class DefaultStorageAccessProviderTests { - private DefaultStorageAccessProvider defaultStorageAccessProvider; + private IStorageAccessProvider defaultStorageAccessProvider; private Mock azureProxyMock; private StorageOptions storageOptions; private StorageAccountInfo storageAccountInfo; diff --git a/src/TesApi.Web/Runner/TaskExecutionScriptingManager.cs b/src/TesApi.Web/Runner/TaskExecutionScriptingManager.cs index b608ad605..08d8eb73f 100644 --- a/src/TesApi.Web/Runner/TaskExecutionScriptingManager.cs +++ b/src/TesApi.Web/Runner/TaskExecutionScriptingManager.cs @@ -166,7 +166,7 @@ private async Task UploadContentAsBlobToInternalTesLocationAsync(TesTask string content, string fileName, CancellationToken cancellationToken) { var blobUrl = - await storageAccessProvider.GetInternalTesTaskBlobUrlAsync(tesTask, fileName, cancellationToken); + await storageAccessProvider.GetInternalTesTaskBlobUrlAsync(tesTask, fileName, cancellationToken, needsWrite: true); await storageAccessProvider.UploadBlobAsync(new Uri(blobUrl), content, cancellationToken); return blobUrl; diff --git a/src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs b/src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs index 
3ba1e13f5..6c8575901 100644 --- a/src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs +++ b/src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs @@ -320,7 +320,7 @@ private async Task UploadContentAndCreateTesInputAsync(TesTask tesTask { var inputFileUrl = await storageAccessProvider.GetInternalTesTaskBlobUrlAsync(tesTask, Guid.NewGuid().ToString(), - cancellationToken); + cancellationToken, needsWrite: true); //return the URL without the SAS token, the runner will add it using the transformation strategy await storageAccessProvider.UploadBlobAsync(new Uri(inputFileUrl), content, cancellationToken); diff --git a/src/TesApi.Web/Scheduler.cs b/src/TesApi.Web/Scheduler.cs index 07363c037..9e69ae223 100644 --- a/src/TesApi.Web/Scheduler.cs +++ b/src/TesApi.Web/Scheduler.cs @@ -160,7 +160,9 @@ async ValueTask UpdateTesTasksFromEventBlobsAsync(CancellationToken stoppingToke var messages = new ConcurrentBag<(string Id, AzureBatchTaskState State)>(); // Get and parse event blobs - await foreach (var message in batchScheduler.GetEventMessagesAsync(stoppingToken, Tes.Runner.Events.EventsPublisher.TaskCompletionEvent).WithCancellation(stoppingToken)) + await foreach (var message in batchScheduler.GetEventMessagesAsync(stoppingToken, Tes.Runner.Events.EventsPublisher.TaskCompletionEvent) + .Concat(batchScheduler.GetEventMessagesAsync(stoppingToken, Tes.Runner.Events.EventsPublisher.ExecutorStartEvent)) + .WithCancellation(stoppingToken)) { messageInfos.Add(message); } @@ -183,9 +185,10 @@ await OrchestrateTesTasksOnBatchAsync( // Helpers async ValueTask ProcessMessage(TesEventMessage messageInfo, CancellationToken cancellationToken) { - // TODO: remove the switch (keeping the message state retrieval) when GetCompletedBatchState can process all events + // TODO: remove the switch (keeping the message state retrieval) when TesEventMessage.GetMessageBatchStateAsync() can process all events switch (messageInfo.Event) { + case Tes.Runner.Events.EventsPublisher.ExecutorStartEvent: case 
Tes.Runner.Events.EventsPublisher.TaskCompletionEvent: messages.Add(await messageInfo.GetMessageBatchStateAsync(cancellationToken)); break; diff --git a/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs b/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs index fb2d268e0..86648eaf0 100644 --- a/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs +++ b/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs @@ -222,7 +222,7 @@ public override string GetInternalTesBlobUrlWithoutSasToken(string blobPath) } /// - public override async Task GetInternalTesTaskBlobUrlAsync(TesTask task, string blobPath, CancellationToken cancellationToken) + public override async Task GetInternalTesTaskBlobUrlAsync(TesTask task, string blobPath, CancellationToken cancellationToken, bool? needsWrite) { var normalizedBlobPath = NormalizedBlobPath(blobPath); @@ -232,10 +232,10 @@ public override async Task GetInternalTesTaskBlobUrlAsync(TesTask task, var blobPathWithPathPrefix = $"/{defaultStorageAccountName}/{task.Resources.GetBackendParameterValue(TesResources.SupportedBackendParameters.internal_path_prefix).Trim('/')}{normalizedBlobPath}"; - return await This.MapLocalPathToSasUrlAsync(blobPathWithPathPrefix, cancellationToken, getContainerSas: true); + return await This.MapLocalPathToSasUrlAsync(blobPathWithPathPrefix, cancellationToken, getContainerSas: needsWrite ?? 
false); } - return await This.GetInternalTesBlobUrlAsync($"/{task.Id}{normalizedBlobPath}", cancellationToken); + return await This.GetInternalTesBlobUrlAsync($"/{task.Id}{normalizedBlobPath}", cancellationToken, needsWrite: needsWrite); } private async Task TryGetStorageAccountInfoAsync(string accountName, CancellationToken cancellationToken, Action onSuccess = null) diff --git a/src/TesApi.Web/Storage/IStorageAccessProvider.cs b/src/TesApi.Web/Storage/IStorageAccessProvider.cs index 600583e44..523291dbe 100644 --- a/src/TesApi.Web/Storage/IStorageAccessProvider.cs +++ b/src/TesApi.Web/Storage/IStorageAccessProvider.cs @@ -100,12 +100,13 @@ public interface IStorageAccessProvider /// /// /// + /// Optional. Allow changing the blob. /// /// /// If the blobPath is not provided(empty), a container SAS token is generated. /// If the blobPath is provided, a SAS token to the blobPath prefixed with the TES task internal segments is generated. /// - public Task GetInternalTesTaskBlobUrlAsync(TesTask task, string blobPath, CancellationToken cancellationToken); + public Task GetInternalTesTaskBlobUrlAsync(TesTask task, string blobPath, CancellationToken cancellationToken, bool? needsWrite = default); /// /// Returns an Azure Storage Blob URL without a SAS token for the specified blob path in the TES task internal storage location. diff --git a/src/TesApi.Web/Storage/StorageAccessProvider.cs b/src/TesApi.Web/Storage/StorageAccessProvider.cs index 79420f25b..d69f63c2a 100644 --- a/src/TesApi.Web/Storage/StorageAccessProvider.cs +++ b/src/TesApi.Web/Storage/StorageAccessProvider.cs @@ -45,7 +45,7 @@ protected StorageAccessProvider(ILogger logger, IAzureProxy azureProxy) } /// - /// Property to assist in directly accessing methods in this instance. + /// Property to assist in directly accessing methods in this instance through the interface. 
/// protected IStorageAccessProvider This => this; @@ -110,7 +110,7 @@ public async Task UploadBlobFromFileAsync(string blobRelativePath, string source public abstract Task GetInternalTesBlobUrlAsync(string blobPath, CancellationToken cancellationToken, bool? needsTags, bool? needsFind, bool? needsWrite); /// - public abstract Task GetInternalTesTaskBlobUrlAsync(TesTask task, string blobPath, CancellationToken cancellationToken); + public abstract Task GetInternalTesTaskBlobUrlAsync(TesTask task, string blobPath, CancellationToken cancellationToken, bool? needsWrite); /// public abstract string GetInternalTesTaskBlobUrlWithoutSasToken(TesTask task, string blobPath); diff --git a/src/TesApi.Web/Storage/TerraStorageAccessProvider.cs b/src/TesApi.Web/Storage/TerraStorageAccessProvider.cs index b495ed56e..35aa20a5d 100644 --- a/src/TesApi.Web/Storage/TerraStorageAccessProvider.cs +++ b/src/TesApi.Web/Storage/TerraStorageAccessProvider.cs @@ -121,8 +121,10 @@ public override async Task GetInternalTesBlobUrlAsync(string blobPath, C } /// - public override async Task GetInternalTesTaskBlobUrlAsync(TesTask task, string blobPath, CancellationToken cancellationToken) + public override async Task GetInternalTesTaskBlobUrlAsync(TesTask task, string blobPath, CancellationToken cancellationToken, bool? needsWrite) { + // Currently all SAS tokens with Terra are R/W so needsWrite is waiting for a safer future. 
+ var blobInfo = GetTerraBlobInfoForInternalTesTask(task, blobPath); if (string.IsNullOrEmpty(blobPath)) diff --git a/src/TesApi.Web/TesEventMessage.cs b/src/TesApi.Web/TesEventMessage.cs index d5ffa4a05..30cad76d9 100644 --- a/src/TesApi.Web/TesEventMessage.cs +++ b/src/TesApi.Web/TesEventMessage.cs @@ -144,6 +144,8 @@ await _azureProxy.SetBlobTags( BatchTaskEndTime: task.Created/*, BatchTaskExitCode: 0*/), + Tes.Runner.Events.EventsPublisher.ExecutorStartEvent => new(AzureBatchTaskState.TaskState.Running), + // TODO: the rest _ => new(AzureBatchTaskState.TaskState.NodePreempted), //throw new System.Diagnostics.UnreachableException(), }; From c28f50e45304a47272bece3799500e4229f3d6f4 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 13 Oct 2023 13:24:40 -0700 Subject: [PATCH 034/202] Fix security on uploaded blobs --- src/TesApi.Web/BatchScheduler.cs | 2 +- src/TesApi.Web/ConfigurationUtils.cs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index f0bfd0a60..c8d8d0d2d 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -358,7 +358,7 @@ public async Task LoadExistingPoolsAsync(CancellationToken cancellationToken) /// public async Task UploadTaskRunnerIfNeeded(CancellationToken cancellationToken) { - var blobUri = new Uri(await storageAccessProvider.GetInternalTesBlobUrlAsync(NodeTaskRunnerFilename, cancellationToken)); + var blobUri = new Uri(await storageAccessProvider.GetInternalTesBlobUrlAsync(NodeTaskRunnerFilename, cancellationToken, needsWrite: true)); var blobProperties = await azureProxy.GetBlobPropertiesAsync(blobUri, cancellationToken); if (!(await File.ReadAllTextAsync(Path.Combine(AppContext.BaseDirectory, $"scripts/{NodeTaskRunnerMD5HashFilename}"), cancellationToken)).Trim().Equals(blobProperties?.ContentMD5, StringComparison.OrdinalIgnoreCase)) { diff --git a/src/TesApi.Web/ConfigurationUtils.cs 
b/src/TesApi.Web/ConfigurationUtils.cs index a760b0228..c51f904f4 100644 --- a/src/TesApi.Web/ConfigurationUtils.cs +++ b/src/TesApi.Web/ConfigurationUtils.cs @@ -72,8 +72,8 @@ public ConfigurationUtils( /// public async Task> ProcessAllowedVmSizesConfigurationFileAsync(CancellationToken cancellationToken) { - var supportedVmSizesUrl = new Uri(await storageAccessProvider.GetInternalTesBlobUrlAsync("/configuration/supported-vm-sizes", cancellationToken)); - var allowedVmSizesUrl = new Uri(await storageAccessProvider.GetInternalTesBlobUrlAsync("/configuration/allowed-vm-sizes", cancellationToken)); + var supportedVmSizesUrl = new Uri(await storageAccessProvider.GetInternalTesBlobUrlAsync("/configuration/supported-vm-sizes", cancellationToken, needsWrite: true)); + var allowedVmSizesUrl = new Uri(await storageAccessProvider.GetInternalTesBlobUrlAsync("/configuration/allowed-vm-sizes", cancellationToken, needsWrite: true)); var supportedVmSizes = (await skuInformationProvider.GetVmSizesAndPricesAsync(batchAccountResourceInformation.Region, cancellationToken)).ToList(); var batchAccountQuotas = await quotaProvider.GetVmCoreQuotaAsync(lowPriority: false, cancellationToken: cancellationToken); From c8bf555a70b16a38b0d05593e9482e29d869675c Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 13 Oct 2023 14:35:08 -0700 Subject: [PATCH 035/202] Ensure that certain critical processes stop the app when failing --- .../Repository/PostgreSqlCachingRepository.cs | 4 ++-- .../Repository/TesTaskPostgreSqlRepository.cs | 7 ++++--- src/Tes/Tes.csproj | 1 + src/TesApi.Web/BatchPoolService.cs | 5 +++-- .../OrchestrateOnBatchSchedulerServiceBase.cs | 19 +++++++++++++++---- src/TesApi.Web/Scheduler.cs | 5 +++-- 6 files changed, 28 insertions(+), 13 deletions(-) diff --git a/src/Tes/Repository/PostgreSqlCachingRepository.cs b/src/Tes/Repository/PostgreSqlCachingRepository.cs index 8a8c36811..552bb61c7 100644 --- a/src/Tes/Repository/PostgreSqlCachingRepository.cs +++ 
b/src/Tes/Repository/PostgreSqlCachingRepository.cs @@ -37,7 +37,7 @@ protected enum WriteAction { Add, Update, Delete } private bool _disposedValue; - protected PostgreSqlCachingRepository(ILogger logger = default, ICache cache = default) + protected PostgreSqlCachingRepository(Microsoft.Extensions.Hosting.IHostApplicationLifetime hostApplicationLifetime, ILogger logger = default, ICache cache = default) { _logger = logger; _cache = cache; @@ -57,7 +57,7 @@ protected PostgreSqlCachingRepository(ILogger logger = default, ICache cache } await Task.Delay(50); // Give the logger time to flush. - throw new System.Diagnostics.UnreachableException("Repository WriterWorkerAsync unexpectedly ended."); // Force the process to exit via this being an unhandled exception. + hostApplicationLifetime?.StopApplication(); }, TaskContinuationOptions.NotOnCanceled); } diff --git a/src/Tes/Repository/TesTaskPostgreSqlRepository.cs b/src/Tes/Repository/TesTaskPostgreSqlRepository.cs index 5fb345841..e178c4057 100644 --- a/src/Tes/Repository/TesTaskPostgreSqlRepository.cs +++ b/src/Tes/Repository/TesTaskPostgreSqlRepository.cs @@ -26,11 +26,12 @@ public sealed class TesTaskPostgreSqlRepository : PostgreSqlCachingRepository /// Default constructor that also will create the schema if it does not exist /// + /// Used for requesting termination of the current application if the writer task unexpectedly exits. 
/// /// /// - public TesTaskPostgreSqlRepository(IOptions options, ILogger logger, ICache cache = null) - : base(logger, cache) + public TesTaskPostgreSqlRepository(Microsoft.Extensions.Hosting.IHostApplicationLifetime hostApplicationLifetime, IOptions options, ILogger logger, ICache cache = null) + : base(hostApplicationLifetime, logger, cache) { var connectionString = new ConnectionStringUtility().GetPostgresConnectionString(options); CreateDbContext = () => { return new TesDbContext(connectionString); }; @@ -44,7 +45,7 @@ public TesTaskPostgreSqlRepository(IOptions options, ILogger< /// /// A delegate that creates a TesTaskPostgreSqlRepository context public TesTaskPostgreSqlRepository(Func createDbContext) - : base() + : base(default) { CreateDbContext = createDbContext; using var dbContext = createDbContext(); diff --git a/src/Tes/Tes.csproj b/src/Tes/Tes.csproj index ab248c35b..fea52820f 100644 --- a/src/Tes/Tes.csproj +++ b/src/Tes/Tes.csproj @@ -13,6 +13,7 @@ + diff --git a/src/TesApi.Web/BatchPoolService.cs b/src/TesApi.Web/BatchPoolService.cs index 979ac2a4a..61ce934d6 100644 --- a/src/TesApi.Web/BatchPoolService.cs +++ b/src/TesApi.Web/BatchPoolService.cs @@ -29,12 +29,13 @@ internal class BatchPoolService : OrchestrateOnBatchSchedulerServiceBase /// /// Default constructor /// + /// Used for requesting termination of the current application during initialization. 
/// The main TES task database repository implementation /// /// /// - public BatchPoolService(IRepository repository, IBatchScheduler batchScheduler, ILogger logger) - : base(repository, batchScheduler, logger) { } + public BatchPoolService(Microsoft.Extensions.Hosting.IHostApplicationLifetime hostApplicationLifetime, IRepository repository, IBatchScheduler batchScheduler, ILogger logger) + : base(hostApplicationLifetime, repository, batchScheduler, logger) { } /// protected override void ExecuteSetup(CancellationToken stoppingToken) diff --git a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs index 409189f9c..93e710924 100644 --- a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs +++ b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs @@ -21,6 +21,7 @@ namespace TesApi.Web ///
internal abstract class OrchestrateOnBatchSchedulerServiceBase : BackgroundService { + private readonly IHostApplicationLifetime hostApplicationLifetime; protected readonly IRepository repository; protected readonly IBatchScheduler batchScheduler; protected readonly ILogger logger; @@ -28,11 +29,13 @@ internal abstract class OrchestrateOnBatchSchedulerServiceBase : BackgroundServi /// /// Default constructor /// + /// Used for requesting termination of the current application. Pass null to allow this service to stop during initialization without taking down the application. /// The main TES task database repository implementation /// The batch scheduler implementation /// The logger instance - protected OrchestrateOnBatchSchedulerServiceBase(IRepository repository, IBatchScheduler batchScheduler, ILogger logger) + protected OrchestrateOnBatchSchedulerServiceBase(IHostApplicationLifetime hostApplicationLifetime, IRepository repository, IBatchScheduler batchScheduler, ILogger logger) { + this.hostApplicationLifetime = hostApplicationLifetime; this.repository = repository; this.batchScheduler = batchScheduler; this.logger = logger; @@ -60,9 +63,17 @@ public override Task StopAsync(CancellationToken cancellationToken) [System.Diagnostics.CodeAnalysis.SuppressMessage("Usage", "CA2254:Template should be a static expression", Justification = "Used to provide service's name in log message.")] protected sealed override async Task ExecuteAsync(CancellationToken stoppingToken) { - // The order of these two calls is critical. - ExecuteSetup(stoppingToken); - await ExecuteSetupAsync(stoppingToken); + try + { + // The order of these two calls is critical. 
+ ExecuteSetup(stoppingToken); + await ExecuteSetupAsync(stoppingToken); + } + catch (Exception ex) when (ex is not OperationCanceledException oce || oce.CancellationToken == CancellationToken.None) + { + logger.LogCritical(ex, "Service {ServiceName} was unable to initialize due to '{Message}'.", GetType().Name, ex.Message); + hostApplicationLifetime?.StopApplication(); + } logger.LogInformation(MarkLogMessage("started.")); diff --git a/src/TesApi.Web/Scheduler.cs b/src/TesApi.Web/Scheduler.cs index 9e69ae223..0c77b5d7a 100644 --- a/src/TesApi.Web/Scheduler.cs +++ b/src/TesApi.Web/Scheduler.cs @@ -26,11 +26,12 @@ internal class Scheduler : OrchestrateOnBatchSchedulerServiceBase /// /// Default constructor /// + /// Used for requesting termination of the current application during initialization. /// The main TES task database repository implementation /// The batch scheduler implementation /// The logger instance - public Scheduler(IRepository repository, IBatchScheduler batchScheduler, ILogger logger) - : base(repository, batchScheduler, logger) { } + public Scheduler(Microsoft.Extensions.Hosting.IHostApplicationLifetime hostApplicationLifetime, IRepository repository, IBatchScheduler batchScheduler, ILogger logger) + : base(hostApplicationLifetime, repository, batchScheduler, logger) { } /// From 33eb1d562fa0febf2d14d3bf17b613a09c7b04c7 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 13 Oct 2023 16:23:40 -0700 Subject: [PATCH 036/202] Run CoA test --- src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs | 3 ++- src/deploy-tes-on-azure/Deployer.cs | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs b/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs index 86648eaf0..2ebfa1307 100644 --- a/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs +++ b/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs @@ -104,6 +104,7 @@ private async Task MapLocalPathToSasUrlImplAsync(string 
path, TimeSpan? { path = $"/{defaultStorageAccountName}{path}"; } + //TODO: refactor this to throw an exception instead of logging and error and returning null. if (!StorageAccountUrlSegments.TryCreate(path, out var pathSegments)) { @@ -119,7 +120,7 @@ private async Task MapLocalPathToSasUrlImplAsync(string path, TimeSpan? { try { - var result = await AddSasTokenAsync(pathSegments, sasTokenDuration, getContainerSas, false, false, true, cancellationToken, path); + var result = await AddSasTokenAsync(pathSegments, sasTokenDuration, getContainerSas, needsTags: false, needsFind: false, needsWrite: true, cancellationToken, path); return result.ToUriString(); } catch diff --git a/src/deploy-tes-on-azure/Deployer.cs b/src/deploy-tes-on-azure/Deployer.cs index b7861f518..5b5d081f4 100644 --- a/src/deploy-tes-on-azure/Deployer.cs +++ b/src/deploy-tes-on-azure/Deployer.cs @@ -367,6 +367,8 @@ await Execute("Connecting to Azure Services...", async () => postgreSqlFlexServer = await ValidateAndGetExistingPostgresqlServerAsync(); var keyVault = await ValidateAndGetExistingKeyVaultAsync(); + ConsoleEx.WriteLine($"Deploying TES on Azure version {Utility.DelimitedTextToDictionary(Utility.GetFileContent("scripts", "env-00-tes-version.txt")).GetValueOrDefault("TesOnAzureVersion")} into resource group '{resourceGroup.Name}'..."); + // Configuration preferences not currently settable by user. 
if (string.IsNullOrWhiteSpace(configuration.PostgreSqlServerName) && configuration.ProvisionPostgreSqlOnAzure.GetValueOrDefault()) { From b6c89b2985e9b53832491b0de8fd9eeb6edd0b58 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 13 Oct 2023 17:11:49 -0700 Subject: [PATCH 037/202] Address breaks in the deployer --- src/deploy-tes-on-azure/Deployer.cs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/deploy-tes-on-azure/Deployer.cs b/src/deploy-tes-on-azure/Deployer.cs index 5b5d081f4..1a2b7810a 100644 --- a/src/deploy-tes-on-azure/Deployer.cs +++ b/src/deploy-tes-on-azure/Deployer.cs @@ -344,6 +344,15 @@ await Execute("Connecting to Azure Services...", async () => await AssignVmAsDataOwnerToStorageAccountAsync(managedIdentity, storageAccount); } + // TODO: (purpetually) update the stated version in the next line to the next-to-last version that performed role assignment changes. + if (installedVersion is null || installedVersion < new Version(4, 4) || + // TODO: (purpetually) update the stated version in the next line to one less than the last version that performs role assignment changes. 
+ (installedVersion < new Version(targetVersion) && installedVersion > new Version(4, 6))) + { + ConsoleEx.WriteLine("Waiting 5 minutes for role assignment propagation..."); + await Task.Delay(System.TimeSpan.FromMinutes(5)); + } + await kubernetesManager.UpgradeValuesYamlAsync(storageAccount, settings); await PerformHelmDeploymentAsync(resourceGroup); } @@ -367,7 +376,7 @@ await Execute("Connecting to Azure Services...", async () => postgreSqlFlexServer = await ValidateAndGetExistingPostgresqlServerAsync(); var keyVault = await ValidateAndGetExistingKeyVaultAsync(); - ConsoleEx.WriteLine($"Deploying TES on Azure version {Utility.DelimitedTextToDictionary(Utility.GetFileContent("scripts", "env-00-tes-version.txt")).GetValueOrDefault("TesOnAzureVersion")} into resource group '{resourceGroup.Name}'..."); + ConsoleEx.WriteLine($"Deploying TES on Azure version {Utility.DelimitedTextToDictionary(Utility.GetFileContent("scripts", "env-00-tes-version.txt")).GetValueOrDefault("TesOnAzureVersion")}..."); // Configuration preferences not currently settable by user. 
if (string.IsNullOrWhiteSpace(configuration.PostgreSqlServerName) && configuration.ProvisionPostgreSqlOnAzure.GetValueOrDefault()) From c177fbe5f06da9f699d49b2745b7cb946947eb7c Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 16 Oct 2023 15:05:12 -0700 Subject: [PATCH 038/202] Removed dead code and changed AzureProxy.ListBlobsAsync --- src/TesApi.Tests/BatchSchedulerTests.cs | 14 ++++---- .../TaskExecutionScriptingManagerTests.cs | 2 +- src/TesApi.Web/AzureProxy.cs | 36 ++++--------------- src/TesApi.Web/BatchScheduler.cs | 19 ++-------- .../CachingWithRetriesAzureProxy.cs | 6 +--- src/TesApi.Web/IAzureProxy.cs | 12 +------ .../Storage/DefaultStorageAccessProvider.cs | 14 +++----- .../Storage/IStorageAccessProvider.cs | 3 +- .../Storage/StorageAccessProvider.cs | 2 +- .../Storage/TerraStorageAccessProvider.cs | 16 ++++----- 10 files changed, 35 insertions(+), 89 deletions(-) diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index c5e9e3ef6..d7833b122 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -16,7 +16,6 @@ using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; using Microsoft.VisualStudio.TestTools.UnitTesting; -using Microsoft.WindowsAzure.Storage.Blob; using Moq; using Newtonsoft.Json; using Tes.Extensions; @@ -1363,7 +1362,7 @@ public async Task CromwellWriteFilesAreDiscoveredAndAddedIfMissedWithContentScri }; var commandScriptUri = UriFromTesInput(tesTask.Inputs[0]); - var executionDirectoryBlobs = tesTask.Inputs.Select(CloudBlobFromTesInput).ToList(); + var executionDirectoryBlobs = tesTask.Inputs.Select(BlobNameUriFromTesInput).ToList(); var azureProxyReturnValues = AzureProxyReturnValues.Defaults; @@ -1383,13 +1382,13 @@ public async Task CromwellWriteFilesAreDiscoveredAndAddedIfMissedWithContentScri var commandScriptDir = new UriBuilder(commandScriptUri) { Path = 
Path.GetDirectoryName(commandScriptUri.AbsolutePath).Replace('\\', '/') }.Uri; executionDirectoryUri = UrlMutableSASEqualityComparer.TrimUri(new Uri(storageAccessProvider.MapLocalPathToSasUrlAsync(commandScriptDir.IsFile ? commandScriptDir.AbsolutePath : commandScriptDir.AbsoluteUri, CancellationToken.None, getContainerSas: true).Result)); - serviceProvider.AzureProxy.Setup(p => p.ListBlobsAsync(It.Is(executionDirectoryUri, new UrlMutableSASEqualityComparer()), It.IsAny())).Returns(Task.FromResult>(executionDirectoryBlobs)); + serviceProvider.AzureProxy.Setup(p => p.ListBlobsAsync(It.Is(executionDirectoryUri, new UrlMutableSASEqualityComparer()), It.IsAny())).Returns(executionDirectoryBlobs.ToAsyncEnumerable()); var uri = new UriBuilder(executionDirectoryUri); uri.Path = uri.Path.TrimEnd('/') + $"/{fileName}"; TesInput writeInput = new() { Url = uri.Uri.AbsoluteUri, Path = Path.Combine(Path.GetDirectoryName(script[1]), fileName).Replace('\\', '/'), Type = TesFileType.FILEEnum, Name = "write_", Content = null }; - executionDirectoryBlobs.Add(CloudBlobFromTesInput(writeInput)); + executionDirectoryBlobs.Add(BlobNameUriFromTesInput(writeInput)); if (fileIsInInputs) { @@ -1406,8 +1405,8 @@ public async Task CromwellWriteFilesAreDiscoveredAndAddedIfMissedWithContentScri Assert.AreEqual(2, filesToDownload.Length); }); - static CloudBlob CloudBlobFromTesInput(TesInput input) - => new(UriFromTesInput(input)); + static (string Name, Uri Uri) BlobNameUriFromTesInput(TesInput input) + => (input.Path, UriFromTesInput(input)); static Uri UriFromTesInput(TesInput input) { @@ -1707,6 +1706,9 @@ private static Action> GetMockAzureProxy(AzureProxyReturnValue azureProxy.Setup(a => a.ListTasksAsync(It.IsAny(), It.IsAny())) .Returns(azureProxyReturnValues.AzureProxyListTasks); + + azureProxy.Setup(a => a.ListBlobsAsync(It.IsAny(), It.IsAny())) + .Returns(AsyncEnumerable.Empty<(string, Uri)>()); }; private static Func> GetMockConfig() diff --git 
a/src/TesApi.Tests/Runner/TaskExecutionScriptingManagerTests.cs b/src/TesApi.Tests/Runner/TaskExecutionScriptingManagerTests.cs index df72c0f3a..44606b05d 100644 --- a/src/TesApi.Tests/Runner/TaskExecutionScriptingManagerTests.cs +++ b/src/TesApi.Tests/Runner/TaskExecutionScriptingManagerTests.cs @@ -47,7 +47,7 @@ public void SetUp() .ReturnsAsync(AssetUrl); storageAccessProviderMock.Setup(x => x.GetInternalTesBlobUrlAsync(It.IsAny(), It.IsAny(), - It.IsAny(), It.IsAny(), It.IsAny())) + It.IsAny(), It.IsAny())) .ReturnsAsync(AssetUrl); taskToNodeTaskConverterMock = new Mock(); diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index bd30d9ffe..8db309f53 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -9,6 +9,7 @@ using System.Threading; using System.Threading.Tasks; using Azure.Storage.Blobs; +using Azure.Storage.Blobs.Specialized; using Microsoft.Azure.Batch; using Microsoft.Azure.Batch.Auth; using Microsoft.Azure.Batch.Common; @@ -396,23 +397,12 @@ public async Task GetBlobPropertiesAsync(Uri blobAbsoluteUri, Ca } /// - public async Task> ListBlobsAsync(Uri directoryUri, CancellationToken cancellationToken) + public IAsyncEnumerable<(string Name, Uri Uri)> ListBlobsAsync(Uri directoryUri, CancellationToken cancellationToken) { - var blob = new CloudBlockBlob(directoryUri); - var directory = blob.Container.GetDirectoryReference(blob.Name); - - BlobContinuationToken continuationToken = null; - var results = new List(); - - do - { - var response = await directory.ListBlobsSegmentedAsync(useFlatBlobListing: true, blobListingDetails: BlobListingDetails.None, maxResults: null, currentToken: continuationToken, options: null, operationContext: null, cancellationToken: cancellationToken); - continuationToken = response.ContinuationToken; - results.AddRange(response.Results.OfType()); - } - while (continuationToken is not null); - - return results; + var directory = (new BlobClient(directoryUri, 
new(BlobClientOptions.ServiceVersion.V2021_04_10))); + return directory.GetParentBlobContainerClient() + .GetBlobsAsync(prefix: directory.Name, cancellationToken: cancellationToken) + .Select(blobItem => (blobItem.Name, new BlobUriBuilder(directory.Uri) { Sas = null, BlobName = blobItem.Name }.ToUri())); } /// @@ -420,19 +410,7 @@ public async Task> ListBlobsAsync(Uri directoryUri, Cance { BlobContainerClient container = new(containerUri, new(BlobClientOptions.ServiceVersion.V2021_04_10)); - return container.GetBlobsAsync(Azure.Storage.Blobs.Models.BlobTraits.Tags, Azure.Storage.Blobs.Models.BlobStates.None, prefix, cancellationToken); - } - - /// - public IAsyncEnumerable ListBlobsWithTagsAsync(Uri containerUri, string prefix, IDictionary tagsQuery, CancellationToken cancellationToken) - { - var fullTagsQuery = Enumerable.Empty() - //.Append(new($"&where=@container='{container.Name}'")) - .Concat(tagsQuery.Select(pair => $"\"{pair.Key}\"='{pair.Value}'")); - - BlobContainerClient container = new(containerUri, new(BlobClientOptions.ServiceVersion.V2021_04_10)); - - return container.FindBlobsByTagsAsync(string.Join(" AND", fullTagsQuery), cancellationToken).Where(blob => blob.BlobName.StartsWith(prefix)); + return container.GetBlobsAsync(Azure.Storage.Blobs.Models.BlobTraits.Tags, prefix: prefix, cancellationToken: cancellationToken); } /// diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index c8d8d0d2d..a853997f7 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -790,7 +790,6 @@ private async Task> GetExistingBlobsInCromwellStorageLocationAsTe { var additionalInputFiles = new List(); - if (!Uri.TryCreate(cromwellExecutionDirectoryUrl, UriKind.Absolute, out _)) { cromwellExecutionDirectoryUrl = $"/{cromwellExecutionDirectoryUrl}"; @@ -801,14 +800,14 @@ private async Task> GetExistingBlobsInCromwellStorageLocationAsTe if (executionDirectoryUri is not null) { var blobsInExecutionDirectory = - 
(await azureProxy.ListBlobsAsync(new Uri(executionDirectoryUri), cancellationToken)).ToList(); + await azureProxy.ListBlobsAsync(new Uri(executionDirectoryUri), cancellationToken).ToListAsync(cancellationToken); var scriptBlob = blobsInExecutionDirectory.FirstOrDefault(b => b.Name.EndsWith($"/{CromwellScriptFileName}")); var commandScript = task.Inputs?.FirstOrDefault( IsCromwellCommandScript); // this should never be null because it's used to set isCromwell - if (scriptBlob is not null) + if (scriptBlob != default) { blobsInExecutionDirectory.Remove(scriptBlob); } @@ -834,7 +833,6 @@ private async Task> GetExistingBlobsInCromwellStorageLocationAsTe } } - return additionalInputFiles; } @@ -1331,30 +1329,17 @@ private static Dictionary DelimitedTextToDictionary(string text, public async IAsyncEnumerable GetEventMessagesAsync([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken, string @event) { const string eventsFolderName = "events"; - var prefix = eventsFolderName + "/"; - //var tags = new Dictionary(); if (!string.IsNullOrWhiteSpace(@event)) { prefix += @event + "/"; - //tags.Add("event-name", @event); } var tesInternalSegments = StorageAccountUrlSegments.Create(storageAccessProvider.GetInternalTesBlobUrlWithoutSasToken(string.Empty)); var eventsStartIndex = (string.IsNullOrEmpty(tesInternalSegments.BlobName) ? 
string.Empty : (tesInternalSegments.BlobName + "/")).Length; var eventsEndIndex = eventsStartIndex + eventsFolderName.Length + 1; - //await foreach (var blobItem in azureProxy.ListBlobsWithTagsAsync(new(await storageAccessProvider.GetInternalTesBlobUrlAsync(string.Empty, cancellationToken, needsFind: true)), prefix, tags, cancellationToken).WithCancellation(cancellationToken)) - //{ - // if (blobItem.Tags.ContainsKey(TesEventMessage.ProcessedTag) || !blobItem.Tags.ContainsKey("task-id")) - // { - // continue; - // } - - - //} - await foreach (var blobItem in azureProxy.ListBlobsWithTagsAsync(new(await storageAccessProvider.GetInternalTesBlobUrlAsync(string.Empty, cancellationToken, needsTags: true)), prefix, cancellationToken).WithCancellation(cancellationToken)) { if (blobItem.Tags.ContainsKey(TesEventMessage.ProcessedTag) || !blobItem.Tags.ContainsKey("task-id")) diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index 8dbfe7390..1d3425d8b 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -154,7 +154,7 @@ public async Task GetStorageAccountInfoAsync(string storageA } /// - public Task> ListBlobsAsync(Uri directoryUri, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.ListBlobsAsync(directoryUri, ct), cancellationToken); + public IAsyncEnumerable<(string Name, Uri Uri)> ListBlobsAsync(Uri directoryUri, CancellationToken cancellationToken) => cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListBlobsAsync(directoryUri, cancellationToken), cachingRetryHandler.RetryPolicy); /// public Task UploadBlobAsync(Uri blobAbsoluteUri, string content, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.UploadBlobAsync(blobAbsoluteUri, content, ct), cancellationToken); @@ -214,10 +214,6 @@ public Task EnableBatchPoolAutoScaleAsync(string 
poolId, bool preemptable, TimeS public IAsyncEnumerable ListBlobsWithTagsAsync(Uri containerUri, string prefix, CancellationToken cancellationToken) => cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListBlobsWithTagsAsync(containerUri, prefix, cancellationToken), cachingRetryHandler.RetryPolicy); - /// - public IAsyncEnumerable ListBlobsWithTagsAsync(Uri containerUri, string prefix, IDictionary tagsQuery, CancellationToken cancellationToken) - => cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListBlobsWithTagsAsync(containerUri, prefix, tagsQuery, cancellationToken), cachingRetryHandler.RetryPolicy); - /// public Task SetBlobTags(Uri blobAbsoluteUri, IDictionary tags, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.SetBlobTags(blobAbsoluteUri, tags, ct), cancellationToken); } diff --git a/src/TesApi.Web/IAzureProxy.cs b/src/TesApi.Web/IAzureProxy.cs index fbf61fb4a..88e1bca00 100644 --- a/src/TesApi.Web/IAzureProxy.cs +++ b/src/TesApi.Web/IAzureProxy.cs @@ -138,7 +138,7 @@ public interface IAzureProxy /// Directory Uri /// A for controlling the lifetime of the asynchronous operation. /// List of blob paths - Task> ListBlobsAsync(Uri directoryUri, CancellationToken cancellationToken); + IAsyncEnumerable<(string Name, Uri Uri)> ListBlobsAsync(Uri directoryUri, CancellationToken cancellationToken); /// /// Fetches the blobs properties @@ -157,16 +157,6 @@ public interface IAzureProxy /// IAsyncEnumerable ListBlobsWithTagsAsync(Uri containerUri, string prefix, CancellationToken cancellationToken); - /// - /// List blobs whose tags match a given search expression in the given directory. - /// - /// Container URI - /// - /// Tags and values to exactly match (case sensitive). - /// A for controlling the lifetime of the asynchronous operation. 
- /// - IAsyncEnumerable ListBlobsWithTagsAsync(Uri containerUri, string prefix, IDictionary tagsQuery, CancellationToken cancellationToken); - /// /// Sets tags on the underlying blob. /// diff --git a/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs b/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs index 2ebfa1307..485a2304f 100644 --- a/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs +++ b/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs @@ -120,7 +120,7 @@ private async Task MapLocalPathToSasUrlImplAsync(string path, TimeSpan? { try { - var result = await AddSasTokenAsync(pathSegments, sasTokenDuration, getContainerSas, needsTags: false, needsFind: false, needsWrite: true, cancellationToken, path); + var result = await AddSasTokenAsync(pathSegments, sasTokenDuration, getContainerSas, needsTags: false, needsWrite: true, cancellationToken: cancellationToken, path: path); return result.ToUriString(); } catch @@ -130,7 +130,7 @@ private async Task MapLocalPathToSasUrlImplAsync(string path, TimeSpan? } } - private async Task AddSasTokenAsync(StorageAccountUrlSegments pathSegments, TimeSpan? sasTokenDuration, bool getContainerSas, bool? needsTags, bool? needsFind, bool? needsWrite, CancellationToken cancellationToken, string path = default) + private async Task AddSasTokenAsync(StorageAccountUrlSegments pathSegments, TimeSpan? sasTokenDuration, bool getContainerSas, bool? needsTags, bool? 
needsWrite, CancellationToken cancellationToken, string path = default) { StorageAccountInfo storageAccountInfo = null; @@ -154,11 +154,7 @@ private async Task AddSasTokenAsync(StorageAccountUrl sasBlobPermissions |= BlobSasPermissions.Tag; } - if (pathSegments.IsContainer && needsFind.GetValueOrDefault()) - { - sasContainerPermissions |= BlobContainerSasPermissions.Filter; - } - else if (needsWrite.GetValueOrDefault()) + if (needsWrite.GetValueOrDefault()) { sasBlobPermissions |= BlobSasPermissions.Add | BlobSasPermissions.Create | BlobSasPermissions.List | BlobSasPermissions.Write; } @@ -181,10 +177,10 @@ private async Task AddSasTokenAsync(StorageAccountUrl } /// - public override async Task GetInternalTesBlobUrlAsync(string blobPath, CancellationToken cancellationToken, bool? needsTags, bool? needsFind, bool? needsWrite) + public override async Task GetInternalTesBlobUrlAsync(string blobPath, CancellationToken cancellationToken, bool? needsTags, bool? needsWrite) { var pathSegments = StorageAccountUrlSegments.Create(GetInternalTesBlobUrlWithoutSasToken(blobPath)); - var resultPathSegments = await AddSasTokenAsync(pathSegments, SasTokenDuration, false, needsTags, needsFind, needsWrite, cancellationToken); + var resultPathSegments = await AddSasTokenAsync(pathSegments, SasTokenDuration, false, needsTags, needsWrite, cancellationToken); return resultPathSegments.ToUriString(); } diff --git a/src/TesApi.Web/Storage/IStorageAccessProvider.cs b/src/TesApi.Web/Storage/IStorageAccessProvider.cs index 523291dbe..b1f254ee8 100644 --- a/src/TesApi.Web/Storage/IStorageAccessProvider.cs +++ b/src/TesApi.Web/Storage/IStorageAccessProvider.cs @@ -85,14 +85,13 @@ public interface IStorageAccessProvider /// /// /// Optional. Read or write the tags on a blob. - /// Optional. Find blobs with index tags. /// Optional. Allow changing the blob. /// /// /// If the blobPath is not provided(empty), a container SAS token is generated. 
/// If the blobPath is provided, a SAS token to the blobPath prefixed with the TES internal segments is generated. /// - public Task GetInternalTesBlobUrlAsync(string blobPath, CancellationToken cancellationToken, bool? needsTags = default, bool? needsFind = default, bool? needsWrite = default); + public Task GetInternalTesBlobUrlAsync(string blobPath, CancellationToken cancellationToken, bool? needsTags = default, bool? needsWrite = default); /// /// Returns an Azure Storage Blob URL with a SAS token for the specified blob path in the TES task internal storage location. diff --git a/src/TesApi.Web/Storage/StorageAccessProvider.cs b/src/TesApi.Web/Storage/StorageAccessProvider.cs index d69f63c2a..77ccb0a50 100644 --- a/src/TesApi.Web/Storage/StorageAccessProvider.cs +++ b/src/TesApi.Web/Storage/StorageAccessProvider.cs @@ -107,7 +107,7 @@ public async Task UploadBlobFromFileAsync(string blobRelativePath, string source public abstract Task MapLocalPathToSasUrlAsync(string path, CancellationToken cancellationToken, TimeSpan? sasTokenDuration, bool getContainerSas); /// - public abstract Task GetInternalTesBlobUrlAsync(string blobPath, CancellationToken cancellationToken, bool? needsTags, bool? needsFind, bool? needsWrite); + public abstract Task GetInternalTesBlobUrlAsync(string blobPath, CancellationToken cancellationToken, bool? needsTags, bool? needsWrite); /// public abstract Task GetInternalTesTaskBlobUrlAsync(TesTask task, string blobPath, CancellationToken cancellationToken, bool? 
needsWrite); diff --git a/src/TesApi.Web/Storage/TerraStorageAccessProvider.cs b/src/TesApi.Web/Storage/TerraStorageAccessProvider.cs index 35aa20a5d..d2841e601 100644 --- a/src/TesApi.Web/Storage/TerraStorageAccessProvider.cs +++ b/src/TesApi.Web/Storage/TerraStorageAccessProvider.cs @@ -99,14 +99,14 @@ public override async Task MapLocalPathToSasUrlAsync(string path, Cancel if (getContainerSas) { - return await GetMappedSasContainerUrlFromWsmAsync(terraBlobInfo, false, false, cancellationToken); + return await GetMappedSasContainerUrlFromWsmAsync(terraBlobInfo, false, cancellationToken); } return await GetMappedSasUrlFromWsmAsync(terraBlobInfo, false, cancellationToken); } /// - public override async Task GetInternalTesBlobUrlAsync(string blobPath, CancellationToken cancellationToken, bool? needsTags, bool? needsFind, bool? needsWrite) + public override async Task GetInternalTesBlobUrlAsync(string blobPath, CancellationToken cancellationToken, bool? needsTags, bool? needsWrite) { // Currently all SAS tokens with Terra are R/W so needsWrite is waiting for a safer future. 
@@ -114,7 +114,7 @@ public override async Task GetInternalTesBlobUrlAsync(string blobPath, C if (string.IsNullOrEmpty(blobPath)) { - return await GetMappedSasContainerUrlFromWsmAsync(blobInfo, needsTags, needsFind, cancellationToken); + return await GetMappedSasContainerUrlFromWsmAsync(blobInfo, needsTags, cancellationToken); } return await GetMappedSasUrlFromWsmAsync(blobInfo, needsTags, cancellationToken); @@ -129,7 +129,7 @@ public override async Task GetInternalTesTaskBlobUrlAsync(TesTask task, if (string.IsNullOrEmpty(blobPath)) { - return await GetMappedSasContainerUrlFromWsmAsync(blobInfo, false, false, cancellationToken); + return await GetMappedSasContainerUrlFromWsmAsync(blobInfo, false, cancellationToken); } return await GetMappedSasUrlFromWsmAsync(blobInfo, false, cancellationToken); @@ -281,9 +281,9 @@ private Guid ToWorkspaceId(string segmentsContainerName) } } - private async Task GetMappedSasContainerUrlFromWsmAsync(TerraBlobInfo blobInfo, bool? needsTags, bool? needsFind, CancellationToken cancellationToken) + private async Task GetMappedSasContainerUrlFromWsmAsync(TerraBlobInfo blobInfo, bool? needsTags, CancellationToken cancellationToken) { - var tokenInfo = await GetWorkspaceContainerSasTokenFromWsmAsync(blobInfo, needsTags, needsFind, cancellationToken); + var tokenInfo = await GetWorkspaceContainerSasTokenFromWsmAsync(blobInfo, needsTags, cancellationToken); var urlBuilder = new UriBuilder(tokenInfo.Url); @@ -341,10 +341,10 @@ private async Task GetWorkspaceBlobSasTokenFromWsmAsync( tokenParams, cancellationToken); } - private async Task GetWorkspaceContainerSasTokenFromWsmAsync(TerraBlobInfo blobInfo, bool? needsTags, bool? needsFind, CancellationToken cancellationToken) + private async Task GetWorkspaceContainerSasTokenFromWsmAsync(TerraBlobInfo blobInfo, bool? 
needsTags, CancellationToken cancellationToken) { // an empty blob name gets a container Sas token - var tokenParams = CreateTokenParamsFromOptions(blobName: "", SasContainerPermissions + (needsTags.GetValueOrDefault() ? "t" : string.Empty) + (needsFind.GetValueOrDefault() ? "l" : string.Empty)); + var tokenParams = CreateTokenParamsFromOptions(blobName: "", SasContainerPermissions + (needsTags.GetValueOrDefault() ? "t" : string.Empty)); Logger.LogInformation( $"Getting Sas container Url from Terra. Wsm workspace id:{blobInfo.WorkspaceId}"); From 0bc5d01fbeb2c4085e1651a3565cfaf88a18ef6a Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 16 Oct 2023 20:47:50 -0700 Subject: [PATCH 039/202] Address failure in CoA --- src/TesApi.Web/AzureProxy.cs | 2 +- src/TesApi.Web/BatchScheduler.cs | 24 ++++++++----------- .../Storage/DefaultStorageAccessProvider.cs | 2 +- 3 files changed, 12 insertions(+), 16 deletions(-) diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index 8db309f53..57824efe7 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -401,7 +401,7 @@ public async Task GetBlobPropertiesAsync(Uri blobAbsoluteUri, Ca { var directory = (new BlobClient(directoryUri, new(BlobClientOptions.ServiceVersion.V2021_04_10))); return directory.GetParentBlobContainerClient() - .GetBlobsAsync(prefix: directory.Name, cancellationToken: cancellationToken) + .GetBlobsAsync(prefix: directory.Name + "/", cancellationToken: cancellationToken) .Select(blobItem => (blobItem.Name, new BlobUriBuilder(directory.Uri) { Sas = null, BlobName = blobItem.Name }.ToUri())); } diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index a853997f7..d6f8fd2d1 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -765,7 +765,7 @@ private string GetNodeManagedIdentityResourceId(TesTask task) return globalManagedIdentity; } - private async Task> GetAdditionalCromwellInputsAsync(TesTask task, 
CancellationToken cancellationToken) + private async Task> GetAdditionalCromwellInputsAsync(TesTask task, CancellationToken cancellationToken) { var cromwellExecutionDirectoryUrl = GetCromwellExecutionDirectoryPathAsUrl(task); var isCromwell = cromwellExecutionDirectoryUrl is not null; @@ -814,22 +814,18 @@ private async Task> GetExistingBlobsInCromwellStorageLocationAsTe if (commandScript is not null) { - var commandScriptPathParts = commandScript.Path.Split('/').ToList(); - var cromwellExecutionDirectory = - string.Join('/', commandScriptPathParts.Take(commandScriptPathParts.Count - 1)); - additionalInputFiles = await blobsInExecutionDirectory - .Select(b => (Path: $"/{cromwellExecutionDirectory.TrimStart('/')}/{b.Name.Split('/').Last()}", - b.Uri)) - .ToAsyncEnumerable() - .SelectAwait(async b => new TesInput + var expectedPathParts = commandScript.Path.Split('/').Length; + + additionalInputFiles = blobsInExecutionDirectory + .Where(b => b.Name.Split('/').Length == expectedPathParts) + .Select(b => new TesInput { - Path = b.Path, - Url = await storageAccessProvider.MapLocalPathToSasUrlAsync(b.Uri.AbsoluteUri, - cancellationToken, getContainerSas: true), - Name = Path.GetFileName(b.Path), + Path = b.Name, + Url = b.Uri.AbsoluteUri, + Name = Path.GetFileName(b.Name), Type = TesFileType.FILEEnum }) - .ToListAsync(cancellationToken); + .ToList(); } } diff --git a/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs b/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs index 485a2304f..1542a6216 100644 --- a/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs +++ b/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs @@ -163,7 +163,7 @@ private async Task AddSasTokenAsync(StorageAccountUrl var builder = pathSegments.IsContainer || getContainerSas ? 
new BlobSasBuilder(sasContainerPermissions, expiresOn) : new BlobSasBuilder(sasBlobPermissions, expiresOn); builder.BlobContainerName = resultPathSegments.ContainerName; - builder.BlobName = resultPathSegments.BlobName; + builder.BlobName = pathSegments.IsContainer || getContainerSas ? string.Empty : resultPathSegments.BlobName; builder.Protocol = SasProtocol.Https; resultPathSegments.SasToken = builder.ToSasQueryParameters(new StorageSharedKeyCredential(storageAccountInfo.Name, accountKey)).ToString(); From a61daa9668f2fbe914ac1680cd011671945b155a Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Tue, 17 Oct 2023 01:37:28 -0700 Subject: [PATCH 040/202] Formatting --- nuget.config | 2 +- src/TesApi.Web/Management/BatchQuotaVerifier.cs | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/nuget.config b/nuget.config index 08221cbbc..475fd325c 100644 --- a/nuget.config +++ b/nuget.config @@ -4,4 +4,4 @@ - \ No newline at end of file + diff --git a/src/TesApi.Web/Management/BatchQuotaVerifier.cs b/src/TesApi.Web/Management/BatchQuotaVerifier.cs index df0c14fb6..e11044713 100644 --- a/src/TesApi.Web/Management/BatchQuotaVerifier.cs +++ b/src/TesApi.Web/Management/BatchQuotaVerifier.cs @@ -6,7 +6,6 @@ using System.Threading; using System.Threading.Tasks; using Microsoft.Extensions.Logging; -using Newtonsoft.Json; using Tes.Models; using TesApi.Web.Management.Models.Quotas; From 0cfd7e29becd8faf216cd14eecb22346be932826 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Tue, 17 Oct 2023 01:58:08 -0700 Subject: [PATCH 041/202] Update deployer update --- src/deploy-tes-on-azure/Deployer.cs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/deploy-tes-on-azure/Deployer.cs b/src/deploy-tes-on-azure/Deployer.cs index 1a2b7810a..8705f807e 100644 --- a/src/deploy-tes-on-azure/Deployer.cs +++ b/src/deploy-tes-on-azure/Deployer.cs @@ -319,10 +319,7 @@ await Execute("Connecting to Azure Services...", async () => } var settings = 
ConfigureSettings(managedIdentity.ClientId, aksValues, installedVersion); - - //if (installedVersion is null || installedVersion < new Version(4, 2)) - //{ - //} + var waitForRoleAssignmentPropagation = false; if (installedVersion is null || installedVersion < new Version(4, 4)) { @@ -331,6 +328,7 @@ await Execute("Connecting to Azure Services...", async () => // Always place the compute nodes into the new batch subnet (needed for simplified communication with batch and is faster/cheaper for azure services access). await AssignMIAsNetworkContributorToResourceAsync(managedIdentity, resourceGroup); + waitForRoleAssignmentPropagation = true; if (string.IsNullOrWhiteSpace(settings["BatchNodesSubnetId"])) { @@ -342,12 +340,14 @@ await Execute("Connecting to Azure Services...", async () => { // Storage account now requires Storage Blob Data Owner await AssignVmAsDataOwnerToStorageAccountAsync(managedIdentity, storageAccount); + waitForRoleAssignmentPropagation = true; } - // TODO: (purpetually) update the stated version in the next line to the next-to-last version that performed role assignment changes. - if (installedVersion is null || installedVersion < new Version(4, 4) || - // TODO: (purpetually) update the stated version in the next line to one less than the last version that performs role assignment changes. 
- (installedVersion < new Version(targetVersion) && installedVersion > new Version(4, 6))) + //if (installedVersion is null || installedVersion < new Version(4, 8)) + //{ + //} + + if (waitForRoleAssignmentPropagation) { ConsoleEx.WriteLine("Waiting 5 minutes for role assignment propagation..."); await Task.Delay(System.TimeSpan.FromMinutes(5)); From d899b3ebe1d1bc2a8fa684b9c26af364d03f4c94 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Tue, 17 Oct 2023 02:48:49 -0700 Subject: [PATCH 042/202] Remove unusable pools --- src/TesApi.Web/AzureProxy.cs | 2 +- src/TesApi.Web/BatchPool.cs | 10 +++++----- src/TesApi.Web/BatchScheduler.cs | 3 ++- src/TesApi.Web/IBatchPool.cs | 3 ++- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index 57824efe7..9630797e6 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -306,7 +306,7 @@ public IAsyncEnumerable GetActivePoolsAsync(string hostName) var activePoolsFilter = new ODATADetailLevel { FilterClause = "state eq 'active'", - SelectClause = BatchPool.CloudPoolSelectClause + SelectClause = BatchPool.CloudPoolSelectClause + ",identity", }; return batchClient.PoolOperations.ListPools(activePoolsFilter).ToAsyncEnumerable() diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index 76183c6cc..dfbd22493 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -607,7 +607,7 @@ await Task.WhenAll( _azureProxy.CreateBatchJobAsync(poolModel.Name, cancellationToken), Task.Run(async () => pool = await _azureProxy.CreateBatchPoolAsync(poolModel, isPreemptible, cancellationToken), cancellationToken)); - Configure(pool); + Configure(pool, false); } catch (AggregateException ex) { @@ -652,7 +652,7 @@ static bool IsInnermostExceptionSocketException(Exception ex) } /// - public async ValueTask AssignPoolAsync(CloudPool pool, CancellationToken cancellationToken) + public async ValueTask 
AssignPoolAsync(CloudPool pool, bool forceRemove, CancellationToken cancellationToken) { ArgumentNullException.ThrowIfNull(pool); @@ -668,15 +668,15 @@ public async ValueTask AssignPoolAsync(CloudPool pool, CancellationToken cancell throw new InvalidOperationException($"Active Job not found for Pool {pool.Id}"); } - Configure(pool); + Configure(pool, forceRemove); } - private void Configure(CloudPool pool) + private void Configure(CloudPool pool, bool forceRemove) { ArgumentNullException.ThrowIfNull(pool); Id = pool.Id; - IsAvailable = DetermineIsAvailable(pool.CreationTime); + IsAvailable = !forceRemove && DetermineIsAvailable(pool.CreationTime); if (IsAvailable) { diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index d6f8fd2d1..c435c6c50 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -345,8 +345,9 @@ public async Task LoadExistingPoolsAsync(CancellationToken cancellationToken) { try { + var forceRemove = !string.IsNullOrWhiteSpace(globalManagedIdentity) && !(cloudPool.Identity?.UserAssignedIdentities?.Any(id => globalManagedIdentity.Equals(id.ResourceId, StringComparison.OrdinalIgnoreCase)) ?? false); var batchPool = batchPoolFactory.CreateNew(); - await batchPool.AssignPoolAsync(cloudPool, cancellationToken); + await batchPool.AssignPoolAsync(cloudPool, forceRemove, cancellationToken); } catch (Exception exc) { diff --git a/src/TesApi.Web/IBatchPool.cs b/src/TesApi.Web/IBatchPool.cs index f74536eea..7038742ae 100644 --- a/src/TesApi.Web/IBatchPool.cs +++ b/src/TesApi.Web/IBatchPool.cs @@ -36,8 +36,9 @@ public interface IBatchPool /// Connects to the provided pool and associated job in the Batch Account. /// /// The to connect to. 
+ /// /// - ValueTask AssignPoolAsync(CloudPool pool, CancellationToken cancellationToken); + ValueTask AssignPoolAsync(CloudPool pool, bool forceRemove, CancellationToken cancellationToken); /// /// Indicates that the pool is not scheduled to run tasks nor running tasks. From b4e6235cdcc62a5741fc0c6f0f89e89df7034170 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Tue, 17 Oct 2023 04:18:01 -0700 Subject: [PATCH 043/202] Log errors that are retried --- .../CacheAndRetryHandlerTest.cs | 2 +- .../TerraLandingZoneApiClientTest.cs | 4 +- .../TerraWsmApiClientTests.cs | 6 +- src/Tes.ApiClients/CachingRetryHandler.cs | 21 +-- src/Tes.ApiClients/RetryHandler.cs | 31 +-- .../ContainerRegistryProviderTests.cs | 6 +- .../TestServices/TestServiceProvider.cs | 1 + .../CachingWithRetriesAzureProxy.cs | 178 +++++++++++++++--- src/TesApi.Web/PagedInterfaceExtensions.cs | 17 +- 9 files changed, 188 insertions(+), 78 deletions(-) diff --git a/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs b/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs index 476512671..19c8e0ac4 100644 --- a/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs +++ b/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs @@ -75,7 +75,7 @@ public async Task ExecuteWithRetryAndCachingAsync_ValueIsNotCachedOnFailureAndTh await Assert.ThrowsExceptionAsync(() => cachingRetryHandler.ExecuteWithRetryAndCachingAsync(cacheKey, _ => Task.Run(() => mockInstanceToRetry.Object.ToString()), System.Threading.CancellationToken.None)); - Assert.IsFalse(appCache.TryGetValue(cacheKey, out string _)); + Assert.IsFalse(appCache.TryGetValue(cacheKey, out string? 
_)); } [TestMethod] diff --git a/src/Tes.ApiClients.Tests/TerraLandingZoneApiClientTest.cs b/src/Tes.ApiClients.Tests/TerraLandingZoneApiClientTest.cs index 646ad4500..e07ca72c0 100644 --- a/src/Tes.ApiClients.Tests/TerraLandingZoneApiClientTest.cs +++ b/src/Tes.ApiClients.Tests/TerraLandingZoneApiClientTest.cs @@ -29,7 +29,7 @@ public async Task GetResourceQuotaAsync_ValidResourceIdReturnsQuotaInformationAn { var body = terraApiStubData.GetResourceQuotaApiResponseInJson(); cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAndCachingAsync(It.IsAny(), - It.IsAny>>(), It.IsAny())) + It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(body); var quota = await terraLandingZoneApiClient.GetResourceQuotaAsync(terraApiStubData.LandingZoneId, terraApiStubData.BatchAccountId, cacheResults: true, cancellationToken: CancellationToken.None); @@ -56,7 +56,7 @@ public async Task GetLandingZoneResourcesAsync_ListOfLandingZoneResourcesAndGets var body = terraApiStubData.GetResourceApiResponseInJson(); cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAndCachingAsync(It.IsAny(), - It.IsAny>>(), It.IsAny())) + It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(body); var resources = await terraLandingZoneApiClient.GetLandingZoneResourcesAsync(terraApiStubData.LandingZoneId, CancellationToken.None); diff --git a/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs b/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs index 8d75ce40d..791d45c3c 100644 --- a/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs +++ b/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs @@ -89,7 +89,7 @@ public async Task GetSasTokenAsync_ValidRequest_ReturnsPayload() Content = new StringContent(terraApiStubData.GetWsmSasTokenApiResponseInJson()) }; - cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny())) + cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(response); var apiResponse = await 
terraWsmApiClient.GetSasTokenAsync(terraApiStubData.WorkspaceId, @@ -108,7 +108,7 @@ public async Task GetContainerResourcesAsync_ValidRequest_ReturnsPayload() Content = new StringContent(terraApiStubData.GetContainerResourcesApiResponseInJson()) }; - cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny())) + cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(response); var apiResponse = await terraWsmApiClient.GetContainerResourcesAsync(terraApiStubData.WorkspaceId, @@ -125,7 +125,7 @@ public async Task DeleteBatchPoolAsync_204Response_Succeeds() var wsmResourceId = Guid.NewGuid(); var response = new HttpResponseMessage(HttpStatusCode.NoContent); - cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny())) + cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(response); await terraWsmApiClient.DeleteBatchPoolAsync(terraApiStubData.WorkspaceId, wsmResourceId, CancellationToken.None); diff --git a/src/Tes.ApiClients/CachingRetryHandler.cs b/src/Tes.ApiClients/CachingRetryHandler.cs index 228349964..5bc94e775 100644 --- a/src/Tes.ApiClients/CachingRetryHandler.cs +++ b/src/Tes.ApiClients/CachingRetryHandler.cs @@ -3,6 +3,7 @@ using Microsoft.Extensions.Caching.Memory; using Microsoft.Extensions.Options; +using Polly; using Tes.ApiClients.Options; namespace Tes.ApiClients @@ -43,12 +44,13 @@ protected CachingRetryHandler() { } /// /// Action to execute /// A for controlling the lifetime of the asynchronous operation. + /// /// - public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, CancellationToken cancellationToken) + public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, CancellationToken cancellationToken, Context? 
context = default) { ValidateArgs(cacheKey, action); - return await ExecuteWithCacheAsync(cacheKey, () => ExecuteWithRetryAsync(action, cancellationToken)); + return await ExecuteWithCacheAsync(cacheKey, () => ExecuteWithRetryAsync(action, cancellationToken, context)); } /// @@ -58,23 +60,14 @@ public virtual async Task ExecuteWithRetryAndCachingAsync(stri /// Action to execute /// /// A for controlling the lifetime of the asynchronous operation. + /// /// /// - public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, DateTimeOffset cachesExpires, CancellationToken cancellationToken) + public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, DateTimeOffset cachesExpires, CancellationToken cancellationToken, Context? context = default) { ValidateArgs(cacheKey, action); - return await ExecuteWithCacheAsync(cacheKey, () => ExecuteWithRetryAsync(action, cancellationToken), cachesExpires); - } - - private static void ValidateArgs(string cacheKey, Func action) - { - ArgumentNullException.ThrowIfNull(action); - - if (string.IsNullOrEmpty(cacheKey)) - { - throw new ArgumentNullException(nameof(cacheKey), "Invalid cache key. The value can't be null or empty"); - } + return await ExecuteWithCacheAsync(cacheKey, () => ExecuteWithRetryAsync(action, cancellationToken, context), cachesExpires); } private static void ValidateArgs(string cacheKey, Func action) diff --git a/src/Tes.ApiClients/RetryHandler.cs b/src/Tes.ApiClients/RetryHandler.cs index 85236ef46..2df8d5acb 100644 --- a/src/Tes.ApiClients/RetryHandler.cs +++ b/src/Tes.ApiClients/RetryHandler.cs @@ -93,37 +93,41 @@ protected RetryHandler() { } /// Executes a delegate with the specified policy. /// /// Action to execute + /// /// Result instance - public void ExecuteWithRetry(Action action) + public void ExecuteWithRetry(Action action, Context? 
context = default) { ArgumentNullException.ThrowIfNull(action); - retryPolicy.Execute(action); + retryPolicy.Execute(_ => action(), context ?? new()); } /// /// Executes a delegate with the specified policy. /// /// Action to execute + /// + /// Result type /// Result instance - public TResult ExecuteWithRetry(Func action) + public TResult ExecuteWithRetry(Func action, Context? context = default) { ArgumentNullException.ThrowIfNull(action); - return retryPolicy.Execute(action); + return retryPolicy.Execute(_ => action(), context ?? new()); } /// /// Executes a delegate with the specified async policy. /// /// Action to execute + /// /// Result type /// Result instance - public virtual Task ExecuteWithRetryAsync(Func> action) + public virtual Task ExecuteWithRetryAsync(Func> action, Context? context = default) { ArgumentNullException.ThrowIfNull(action); - return asyncRetryPolicy.ExecuteAsync(action); + return asyncRetryPolicy.ExecuteAsync(_ => action(), context ?? new()); } /// @@ -131,13 +135,14 @@ public virtual Task ExecuteWithRetryAsync(Func> /// /// Action to execute /// A for controlling the lifetime of the asynchronous operation. + /// /// Result type /// Result instance - public virtual Task ExecuteWithRetryAsync(Func> action, CancellationToken cancellationToken) + public virtual Task ExecuteWithRetryAsync(Func> action, CancellationToken cancellationToken, Context? context = default) { ArgumentNullException.ThrowIfNull(action); - return asyncRetryPolicy.ExecuteAsync(action, cancellationToken); + return asyncRetryPolicy.ExecuteAsync((_, ct) => action(ct), context ?? new(), cancellationToken); } /// @@ -145,12 +150,13 @@ public virtual Task ExecuteWithRetryAsync(Func /// Action to execute /// A for controlling the lifetime of the asynchronous operation. 
+ /// /// Result instance - public async Task ExecuteWithRetryAsync(Func action, CancellationToken cancellationToken) + public async Task ExecuteWithRetryAsync(Func action, CancellationToken cancellationToken, Context? context = default) { ArgumentNullException.ThrowIfNull(action); - await asyncRetryPolicy.ExecuteAsync(action, cancellationToken); + await asyncRetryPolicy.ExecuteAsync((_, ct) => action(ct), context ?? new(), cancellationToken); } /// @@ -158,12 +164,13 @@ public async Task ExecuteWithRetryAsync(Func action, Ca /// /// Action to execute /// A for controlling the lifetime of the asynchronous operation. + /// /// Result HttpResponse - public virtual async Task ExecuteHttpRequestWithRetryAsync(Func> action, CancellationToken cancellationToken) + public virtual async Task ExecuteHttpRequestWithRetryAsync(Func> action, CancellationToken cancellationToken, Context? context = default) { ArgumentNullException.ThrowIfNull(action); - return await asyncHttpRetryPolicy.ExecuteAsync(action, cancellationToken); + return await asyncHttpRetryPolicy.ExecuteAsync((_, ct) => action(ct), context ?? 
new(), cancellationToken); } } diff --git a/src/TesApi.Tests/ContainerRegistryProviderTests.cs b/src/TesApi.Tests/ContainerRegistryProviderTests.cs index 7c089f35b..3a94ab463 100644 --- a/src/TesApi.Tests/ContainerRegistryProviderTests.cs +++ b/src/TesApi.Tests/ContainerRegistryProviderTests.cs @@ -51,7 +51,7 @@ public async Task GetContainerRegistryInfoAsync_ServerIsAccessible_ReturnsAndAdd var server = "registry.com"; var image = $"{server}/image"; retryHandlerMock.Setup(r => - r.ExecuteWithRetryAsync(It.IsAny>>>(), It.IsAny())) + r.ExecuteWithRetryAsync(It.IsAny>>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(new List() { new ContainerRegistryInfo() { RegistryServer = server } @@ -81,7 +81,7 @@ public async Task GetContainerRegistryInfoAsync_ServerInCache_ReturnsRegistryInf Assert.AreEqual(server, container.RegistryServer); appCacheMock.Verify(c => c.TryGetValue(It.Is(v => $"{nameof(ContainerRegistryProvider)}:{image}".Equals(v)), out It.Ref.IsAny), Times.Once()); retryHandlerMock.Verify(r => - r.ExecuteWithRetryAsync(It.IsAny>>>(), It.IsAny()), Times.Never); + r.ExecuteWithRetryAsync(It.IsAny>>>(), It.IsAny(), It.IsAny()), Times.Never); } [TestMethod] @@ -103,7 +103,7 @@ public async Task GetContainerRegistryInfoAsync_NoAccessibleServerNoServerCached retryHandlerMock.Setup(r => r.ExecuteWithRetryAsync( It.IsAny>>>(), - It.IsAny())) + It.IsAny(), It.IsAny())) .ReturnsAsync(new List() { new ContainerRegistryInfo() { RegistryServer = server } diff --git a/src/TesApi.Tests/TestServices/TestServiceProvider.cs b/src/TesApi.Tests/TestServices/TestServiceProvider.cs index 754ca2036..f5e87d050 100644 --- a/src/TesApi.Tests/TestServices/TestServiceProvider.cs +++ b/src/TesApi.Tests/TestServices/TestServiceProvider.cs @@ -84,6 +84,7 @@ internal TestServiceProvider( .AddTransient>(_ => NullLogger.Instance) .AddTransient>(_ => NullLogger.Instance) .AddTransient>(_ => NullLogger.Instance) + .AddTransient>(_ => NullLogger.Instance) .AddSingleton() .AddSingleton() .AddSingleton() 
diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index 1d3425d8b..b093192b9 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -10,6 +10,7 @@ using Microsoft.Azure.Batch; using Microsoft.Azure.Batch.Common; using Microsoft.Extensions.Caching.Memory; +using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using Polly; using Polly.Retry; @@ -25,6 +26,7 @@ namespace TesApi.Web /// public class CachingWithRetriesAzureProxy : IAzureProxy { + private readonly ILogger logger; private readonly IAzureProxy azureProxy; private readonly CachingRetryHandler cachingRetryHandler; private readonly AsyncRetryPolicy batchPoolOrJobCreateOrTaskAddHandler; @@ -35,13 +37,15 @@ public class CachingWithRetriesAzureProxy : IAzureProxy /// /// /// - public CachingWithRetriesAzureProxy(IAzureProxy azureProxy, IOptions retryPolicyOptions, CachingRetryHandler cachingRetryHandler) + /// + public CachingWithRetriesAzureProxy(IAzureProxy azureProxy, IOptions retryPolicyOptions, CachingRetryHandler cachingRetryHandler, ILogger logger) { ArgumentNullException.ThrowIfNull(azureProxy); ArgumentNullException.ThrowIfNull(cachingRetryHandler); this.cachingRetryHandler = cachingRetryHandler; this.azureProxy = azureProxy; + this.logger = logger; var creationErrorFoundCodes = new string[] { @@ -87,53 +91,119 @@ public async Task AddBatchTaskAsync(string tesTaskId, CloudTask cloudTask, strin } /// - public Task DeleteBatchJobAsync(string jobId, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchJobAsync(jobId, ct), cancellationToken); + public Task DeleteBatchJobAsync(string jobId, CancellationToken cancellationToken) + { + var ctx = new Context(); + ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying DeleteBatchJobAsync ({RetryCount}).", retryCount)); + return 
cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchJobAsync(jobId, ct), cancellationToken, ctx); + } /// - public Task DeleteBatchTaskAsync(string tesTaskId, string jobId, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchTaskAsync(tesTaskId, jobId, ct), cancellationToken); + public Task DeleteBatchTaskAsync(string tesTaskId, string jobId, CancellationToken cancellationToken) + { + var ctx = new Context(); + ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying DeleteBatchTaskAsync ({RetryCount}).", retryCount)); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchTaskAsync(tesTaskId, jobId, ct), cancellationToken, ctx); + } /// - public Task TerminateBatchTaskAsync(string tesTaskId, string jobId, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.TerminateBatchTaskAsync(tesTaskId, jobId, ct), cancellationToken); + public Task TerminateBatchTaskAsync(string tesTaskId, string jobId, CancellationToken cancellationToken) + { + var ctx = new Context(); + ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying TerminateBatchTaskAsync ({RetryCount}).", retryCount)); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.TerminateBatchTaskAsync(tesTaskId, jobId, ct), cancellationToken, ctx); + } /// - public Task DeleteBatchPoolAsync(string poolId, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchPoolAsync(poolId, ct), cancellationToken); + public Task DeleteBatchPoolAsync(string poolId, CancellationToken cancellationToken) + { + var ctx = new Context(); + ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying DeleteBatchPoolAsync ({RetryCount}).", retryCount)); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => 
azureProxy.DeleteBatchPoolAsync(poolId, ct), cancellationToken, ctx); + } /// - public Task GetBatchPoolAsync(string poolId, CancellationToken cancellationToken, DetailLevel detailLevel) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBatchPoolAsync(poolId, ct, detailLevel), cancellationToken); + public Task GetBatchPoolAsync(string poolId, CancellationToken cancellationToken, DetailLevel detailLevel) + { + var ctx = new Context(); + ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying GetBatchPoolAsync ({RetryCount}).", retryCount)); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBatchPoolAsync(poolId, ct, detailLevel), cancellationToken, ctx); + } /// - public Task GetBatchJobAsync(string jobId, CancellationToken cancellationToken, DetailLevel detailLevel) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBatchJobAsync(jobId, ct, detailLevel), cancellationToken); + public Task GetBatchJobAsync(string jobId, CancellationToken cancellationToken, DetailLevel detailLevel) + { + var ctx = new Context(); + ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying GetBatchJobAsync ({RetryCount}).", retryCount)); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBatchJobAsync(jobId, ct, detailLevel), cancellationToken, ctx); + } /// public async Task DeleteBatchComputeNodesAsync(string poolId, IEnumerable computeNodes, CancellationToken cancellationToken) { cachingRetryHandler.AppCache.Remove($"{nameof(CachingWithRetriesAzureProxy)}:{poolId}"); - await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchComputeNodesAsync(poolId, computeNodes, ct), cancellationToken); + var ctx = new Context(); + ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying DeleteBatchComputeNodesAsync ({RetryCount}).", retryCount)); + await cachingRetryHandler.ExecuteWithRetryAsync(ct => 
azureProxy.DeleteBatchComputeNodesAsync(poolId, computeNodes, ct), cancellationToken, ctx); } /// - public Task DownloadBlobAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DownloadBlobAsync(blobAbsoluteUri, ct), cancellationToken); + public Task DownloadBlobAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) + { + var ctx = new Context(); + ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying DownloadBlobAsync ({RetryCount}).", retryCount)); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DownloadBlobAsync(blobAbsoluteUri, ct), cancellationToken, ctx); + } /// - public Task BlobExistsAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.BlobExistsAsync(blobAbsoluteUri, ct), cancellationToken); + public Task BlobExistsAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) + { + var ctx = new Context(); + ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying BlobExistsAsync ({RetryCount}).", retryCount)); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.BlobExistsAsync(blobAbsoluteUri, ct), cancellationToken, ctx); + } /// - public IAsyncEnumerable GetActivePoolsAsync(string hostName) => cachingRetryHandler.ExecuteWithRetry(() => azureProxy.GetActivePoolsAsync(hostName)); + public IAsyncEnumerable GetActivePoolsAsync(string hostName) + { + var ctx = new Context(); + ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying GetActivePoolsAsync ({RetryCount}).", retryCount)); + return cachingRetryHandler.ExecuteWithRetry(() => azureProxy.GetActivePoolsAsync(hostName), ctx); + } /// - public int GetBatchActiveJobCount() => cachingRetryHandler.ExecuteWithRetry(() => azureProxy.GetBatchActiveJobCount()); + public int GetBatchActiveJobCount() + { + var ctx = 
new Context(); + ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying GetBatchActiveJobCount ({RetryCount}).", retryCount)); + return cachingRetryHandler.ExecuteWithRetry(azureProxy.GetBatchActiveJobCount, ctx); + } /// - public IEnumerable GetBatchActiveNodeCountByVmSize() => cachingRetryHandler.ExecuteWithRetry(() => azureProxy.GetBatchActiveNodeCountByVmSize()); + public IEnumerable GetBatchActiveNodeCountByVmSize() + { + var ctx = new Context(); + ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying GetBatchActiveNodeCountByVmSize ({RetryCount}).", retryCount)); + return cachingRetryHandler.ExecuteWithRetry(azureProxy.GetBatchActiveNodeCountByVmSize, ctx); + } /// - public int GetBatchActivePoolCount() => cachingRetryHandler.ExecuteWithRetry(() => azureProxy.GetBatchActivePoolCount()); + public int GetBatchActivePoolCount() + { + var ctx = new Context(); + ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying GetBatchActivePoolCount ({RetryCount}).", retryCount)); + return cachingRetryHandler.ExecuteWithRetry(azureProxy.GetBatchActivePoolCount, ctx); + } /// public Task GetStorageAccountKeyAsync(StorageAccountInfo storageAccountInfo, CancellationToken cancellationToken) - => cachingRetryHandler.ExecuteWithRetryAndCachingAsync($"{nameof(CachingWithRetriesAzureProxy)}:{storageAccountInfo.Id}", - ct => azureProxy.GetStorageAccountKeyAsync(storageAccountInfo, ct), DateTimeOffset.Now.AddHours(1), cancellationToken); + { + var ctx = new Context(); + ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying GetStorageAccountKeyAsync ({RetryCount}).", retryCount)); + return cachingRetryHandler.ExecuteWithRetryAndCachingAsync($"{nameof(CachingWithRetriesAzureProxy)}:{storageAccountInfo.Id}", + ct => azureProxy.GetStorageAccountKeyAsync(storageAccountInfo, ct), DateTimeOffset.Now.AddHours(1), cancellationToken, ctx); + } 
/// public async Task GetStorageAccountInfoAsync(string storageAccountName, CancellationToken cancellationToken) @@ -142,7 +212,9 @@ public async Task GetStorageAccountInfoAsync(string storageA if (storageAccountInfo is null) { - storageAccountInfo = await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetStorageAccountInfoAsync(storageAccountName, ct), cancellationToken); + var ctx = new Context(); + ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying GetStorageAccountInfoAsync ({RetryCount}).", retryCount)); + storageAccountInfo = await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetStorageAccountInfoAsync(storageAccountName, ct), cancellationToken, ctx); if (storageAccountInfo is not null) { @@ -154,16 +226,36 @@ public async Task GetStorageAccountInfoAsync(string storageA } /// - public IAsyncEnumerable<(string Name, Uri Uri)> ListBlobsAsync(Uri directoryUri, CancellationToken cancellationToken) => cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListBlobsAsync(directoryUri, cancellationToken), cachingRetryHandler.RetryPolicy); + public IAsyncEnumerable<(string Name, Uri Uri)> ListBlobsAsync(Uri directoryUri, CancellationToken cancellationToken) + { + var ctx = new Context(); + ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying ListBlobsAsync ({RetryCount}).", retryCount)); + return cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListBlobsAsync(directoryUri, cancellationToken), cachingRetryHandler.RetryPolicy, ctx); + } /// - public Task UploadBlobAsync(Uri blobAbsoluteUri, string content, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.UploadBlobAsync(blobAbsoluteUri, content, ct), cancellationToken); + public Task UploadBlobAsync(Uri blobAbsoluteUri, string content, CancellationToken cancellationToken) + { + var ctx = new Context(); + ctx.SetOnRetryHandler((outcome, 
timespan, retryCount) => logger.LogError(outcome, "Retrying UploadBlobAsync ({RetryCount}).", retryCount)); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.UploadBlobAsync(blobAbsoluteUri, content, ct), cancellationToken); + } /// - public Task UploadBlobFromFileAsync(Uri blobAbsoluteUri, string filePath, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.UploadBlobFromFileAsync(blobAbsoluteUri, filePath, ct), cancellationToken); + public Task UploadBlobFromFileAsync(Uri blobAbsoluteUri, string filePath, CancellationToken cancellationToken) + { + var ctx = new Context(); + ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying UploadBlobFromFileAsync ({RetryCount}).", retryCount)); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.UploadBlobFromFileAsync(blobAbsoluteUri, filePath, ct), cancellationToken, ctx); + } /// - public Task GetBlobPropertiesAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBlobPropertiesAsync(blobAbsoluteUri, ct), cancellationToken); + public Task GetBlobPropertiesAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) + { + var ctx = new Context(); + ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying GetBlobPropertiesAsync ({RetryCount}).", retryCount)); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBlobPropertiesAsync(blobAbsoluteUri, ct), cancellationToken, ctx); + } /// public bool LocalFileExists(string path) => azureProxy.LocalFileExists(path); @@ -189,16 +281,32 @@ public async Task CreateBatchPoolAsync(BatchModels.Pool poolInfo, boo /// public Task<(AllocationState? AllocationState, bool? AutoScaleEnabled, int? TargetLowPriority, int? CurrentLowPriority, int? TargetDedicated, int? 
CurrentDedicated)> GetFullAllocationStateAsync(string poolId, CancellationToken cancellationToken) - => cachingRetryHandler.ExecuteWithRetryAndCachingAsync( + { + var ctx = new Context(); + ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying GetFullAllocationStateAsync ({RetryCount}).", retryCount)); + return cachingRetryHandler.ExecuteWithRetryAndCachingAsync( $"{nameof(CachingWithRetriesAzureProxy)}:{poolId}", ct => azureProxy.GetFullAllocationStateAsync(poolId, ct), - DateTimeOffset.Now.Add(BatchPoolService.RunInterval).Subtract(TimeSpan.FromSeconds(1)), cancellationToken); + DateTimeOffset.Now.Add(BatchPoolService.RunInterval).Subtract(TimeSpan.FromSeconds(1)), + cancellationToken, + ctx); + } /// - public IAsyncEnumerable ListComputeNodesAsync(string poolId, DetailLevel detailLevel) => cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListComputeNodesAsync(poolId, detailLevel), cachingRetryHandler.RetryPolicy); + public IAsyncEnumerable ListComputeNodesAsync(string poolId, DetailLevel detailLevel) + { + var ctx = new Context(); + ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying ListComputeNodesAsync ({RetryCount}).", retryCount)); + return cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListComputeNodesAsync(poolId, detailLevel), cachingRetryHandler.RetryPolicy, ctx); + } /// - public IAsyncEnumerable ListTasksAsync(string jobId, DetailLevel detailLevel) => cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListTasksAsync(jobId, detailLevel), cachingRetryHandler.RetryPolicy); + public IAsyncEnumerable ListTasksAsync(string jobId, DetailLevel detailLevel) + { + var ctx = new Context(); + ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying ListTasksAsync ({RetryCount}).", retryCount)); + return cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListTasksAsync(jobId, 
detailLevel), cachingRetryHandler.RetryPolicy, ctx); + } /// public Task DisableBatchPoolAutoScaleAsync(string poolId, CancellationToken cancellationToken) => azureProxy.DisableBatchPoolAutoScaleAsync(poolId, cancellationToken); @@ -208,13 +316,27 @@ public Task EnableBatchPoolAutoScaleAsync(string poolId, bool preemptable, TimeS => azureProxy.EnableBatchPoolAutoScaleAsync(poolId, preemptable, interval, formulaFactory, currentTargetFunc, cancellationToken); /// - public Task EvaluateAutoScaleAsync(string poolId, string autoscaleFormula, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.EvaluateAutoScaleAsync(poolId, autoscaleFormula, ct), cancellationToken); + public Task EvaluateAutoScaleAsync(string poolId, string autoscaleFormula, CancellationToken cancellationToken) + { + var ctx = new Context(); + ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying EvaluateAutoScaleAsync ({RetryCount}).", retryCount)); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.EvaluateAutoScaleAsync(poolId, autoscaleFormula, ct), cancellationToken, ctx); + } /// public IAsyncEnumerable ListBlobsWithTagsAsync(Uri containerUri, string prefix, CancellationToken cancellationToken) - => cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListBlobsWithTagsAsync(containerUri, prefix, cancellationToken), cachingRetryHandler.RetryPolicy); + { + var ctx = new Context(); + ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying ListBlobsWithTagsAsync ({RetryCount}).", retryCount)); + return cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListBlobsWithTagsAsync(containerUri, prefix, cancellationToken), cachingRetryHandler.RetryPolicy, ctx); + } /// - public Task SetBlobTags(Uri blobAbsoluteUri, IDictionary tags, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => 
azureProxy.SetBlobTags(blobAbsoluteUri, tags, ct), cancellationToken); + public Task SetBlobTags(Uri blobAbsoluteUri, IDictionary tags, CancellationToken cancellationToken) + { + var ctx = new Context(); + ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying SetBlobTags ({RetryCount}).", retryCount)); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.SetBlobTags(blobAbsoluteUri, tags, ct), cancellationToken, ctx); + } } } diff --git a/src/TesApi.Web/PagedInterfaceExtensions.cs b/src/TesApi.Web/PagedInterfaceExtensions.cs index 3acf1b865..297c2ab91 100644 --- a/src/TesApi.Web/PagedInterfaceExtensions.cs +++ b/src/TesApi.Web/PagedInterfaceExtensions.cs @@ -50,19 +50,6 @@ public static IAsyncEnumerable ToAsyncEnumerable(this IPagedCollection public static IAsyncEnumerable ToAsyncEnumerable(this IPage source, Func>> nextPageFunc) => new AsyncEnumerable(source, nextPageFunc); - /// - /// Adapts calls returning to . - /// - /// Type of results returned in by . - /// Policy retrying calls made while enumerating results returned by . - /// Method returning . - /// Policy retrying call to . - /// - public static IAsyncEnumerable ExecuteAsync(this AsyncRetryPolicy asyncRetryPolicy, Func> func, RetryPolicy retryPolicy) - { - return asyncRetryPolicy.ExecuteAsync(_ => func(), retryPolicy, new()); - } - /// /// Adapts calls returning to . /// @@ -72,13 +59,13 @@ public static IAsyncEnumerable ExecuteAsync(this AsyncRetryPolicy asyncRet /// Policy retrying call to . /// An optional . 
/// - public static IAsyncEnumerable ExecuteAsync(this AsyncRetryPolicy asyncRetryPolicy, Func> func, RetryPolicy retryPolicy, Polly.Context ctx) + public static IAsyncEnumerable ExecuteAsync(this AsyncRetryPolicy asyncRetryPolicy, Func> func, RetryPolicy retryPolicy, Polly.Context ctx = default) { ArgumentNullException.ThrowIfNull(asyncRetryPolicy); ArgumentNullException.ThrowIfNull(func); ArgumentNullException.ThrowIfNull(retryPolicy); - return new PollyAsyncEnumerable(retryPolicy.Execute(ctx => func(ctx), ctx ??= new()), asyncRetryPolicy, ctx); + return new PollyAsyncEnumerable(retryPolicy.Execute(_ => func(), ctx ??= new()), asyncRetryPolicy, ctx); } #region Implementation classes From 314740cc6eadee20f3837e22cf3a3fe074fe02d2 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 18 Oct 2023 09:58:57 -0700 Subject: [PATCH 044/202] Implement remainder of events --- src/Tes.Runner/Events/EventSink.cs | 8 - .../TestServices/TestServiceProvider.cs | 9 +- src/TesApi.Web/AzureBatchTaskState.cs | 17 +- src/TesApi.Web/AzureProxy.cs | 1 + src/TesApi.Web/BatchPool.cs | 2 +- src/TesApi.Web/BatchPoolFactory.cs | 28 --- src/TesApi.Web/BatchPools.BatchScheduler.cs | 2 +- src/TesApi.Web/BatchScheduler.cs | 63 ++--- .../CachingWithRetriesAzureProxy.cs | 1 + src/TesApi.Web/Events/NodeEventMessage.cs | 229 ++++++++++++++++++ .../OperationCancelledExceptionFilter.cs | 2 +- .../PagedInterfaceExtensions.cs | 2 +- src/TesApi.Web/IBatchPoolFactory.cs | 17 -- src/TesApi.Web/IBatchScheduler.cs | 2 +- .../ArmResourceInformationFinder.cs | 1 + src/TesApi.Web/Scheduler.cs | 21 +- src/TesApi.Web/Startup.cs | 9 +- src/TesApi.Web/TesEventMessage.cs | 180 -------------- 18 files changed, 303 insertions(+), 291 deletions(-) delete mode 100644 src/TesApi.Web/BatchPoolFactory.cs create mode 100644 src/TesApi.Web/Events/NodeEventMessage.cs rename src/TesApi.Web/{Controllers => Exceptions}/OperationCancelledExceptionFilter.cs (98%) rename src/TesApi.Web/{ => 
Extensions}/PagedInterfaceExtensions.cs (99%) delete mode 100644 src/TesApi.Web/IBatchPoolFactory.cs delete mode 100644 src/TesApi.Web/TesEventMessage.cs diff --git a/src/Tes.Runner/Events/EventSink.cs b/src/Tes.Runner/Events/EventSink.cs index 7e4aeea37..3a61957fc 100644 --- a/src/Tes.Runner/Events/EventSink.cs +++ b/src/Tes.Runner/Events/EventSink.cs @@ -59,14 +59,6 @@ protected static IDictionary ToEventTag(EventMessage eventMessag { "workflow-id", eventMessage.CorrelationId }, { "event-name", eventMessage.Name }, { "created", eventMessage.Created.ToString(Iso8601DateFormat) } - - //{ "event_name", eventMessage.Name }, - //{ "event_id", eventMessage.Id }, - //{ "entity_type", eventMessage.EntityType }, - //{ "task_id", eventMessage.EntityId }, - //{ "workflow_id", eventMessage.CorrelationId }, - ////format date to ISO 8601, which is URL friendly - //{ "created", eventMessage.Created.ToString(Iso8601DateFormat) } }; } diff --git a/src/TesApi.Tests/TestServices/TestServiceProvider.cs b/src/TesApi.Tests/TestServices/TestServiceProvider.cs index f5e87d050..deb48ad4f 100644 --- a/src/TesApi.Tests/TestServices/TestServiceProvider.cs +++ b/src/TesApi.Tests/TestServices/TestServiceProvider.cs @@ -18,6 +18,7 @@ using Tes.Models; using Tes.Repository; using TesApi.Web; +using TesApi.Web.Events; using TesApi.Web.Management; using TesApi.Web.Management.Configuration; using TesApi.Web.Options; @@ -83,15 +84,15 @@ internal TestServiceProvider( .AddTransient>(_ => NullLogger.Instance) .AddTransient>(_ => NullLogger.Instance) .AddTransient>(_ => NullLogger.Instance) - .AddTransient>(_ => NullLogger.Instance) + .AddTransient>(_ => NullLogger.Instance) .AddTransient>(_ => NullLogger.Instance) .AddSingleton() .AddSingleton() .AddSingleton() - .AddSingleton() + .AddSingleton>(s => () => s.GetService()) .AddTransient() - .AddSingleton() - .AddTransient() + .AddSingleton, string, NodeEventMessage>>(s => (blobAbsoluteUri, tags, @event) => ActivatorUtilities.CreateInstance(s, 
blobAbsoluteUri, tags, @event)) + .AddTransient() .AddSingleton() .AddSingleton(s => GetArmBatchQuotaProvider(s, armBatchQuotaProvider)) //added so config utils gets the arm implementation, to be removed once config utils is refactored. .AddSingleton() diff --git a/src/TesApi.Web/AzureBatchTaskState.cs b/src/TesApi.Web/AzureBatchTaskState.cs index 162059598..a394d429f 100644 --- a/src/TesApi.Web/AzureBatchTaskState.cs +++ b/src/TesApi.Web/AzureBatchTaskState.cs @@ -24,6 +24,16 @@ public record AzureBatchTaskState(TaskState State, IEnumerable Ou /// public enum TaskState { + /// + /// The event does not represent any change in task state. + /// + NoChange, + + /// + /// The event provides metadata without changing the task's state. + /// + InfoUpdate, + /// /// A request has been made for the task's cancellation. /// @@ -72,7 +82,12 @@ public enum TaskState /// /// Node failed during startup or task execution (for example, ContainerInvalidImage, DiskFull) /// - NodeFailedDuringStartupOrExecution + NodeFailedDuringStartupOrExecution, + + /// + /// Node failed during upload or download + /// + NodeFilesUploadOrDownloadFailed, } /// diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index 9630797e6..82ce96586 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -25,6 +25,7 @@ using Microsoft.WindowsAzure.Storage.Blob; using Polly; using Polly.Retry; +using TesApi.Web.Extensions; using TesApi.Web.Management.Batch; using TesApi.Web.Management.Configuration; using TesApi.Web.Storage; diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index dfbd22493..e9b5e06c7 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -564,7 +564,7 @@ TaskFailureInformation PopNextStartTaskFailure() /// public IAsyncEnumerable GetCompletedTasks(CancellationToken _1) - => GetTasksAsync("id,executionInfo", "state eq 'completed'"); + => GetTasksAsync("id,executionInfo", $"state eq 'completed' and 
stateTransitionTime lt {DateTime.UtcNow - TimeSpan.FromSeconds(30):O}"); // Returns false when pool/job was removed because it was not found. Returns true if the error was completely something else. private async ValueTask RemoveMissingPoolsAsync(Exception ex, CancellationToken cancellationToken) diff --git a/src/TesApi.Web/BatchPoolFactory.cs b/src/TesApi.Web/BatchPoolFactory.cs deleted file mode 100644 index 5b5f5a5c8..000000000 --- a/src/TesApi.Web/BatchPoolFactory.cs +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. - -using System; -using Microsoft.Extensions.DependencyInjection; - -namespace TesApi.Web -{ - // TODO: Consider moving this class's implementation to Startup - /// - /// Factory to create BatchPool instances. - /// - public sealed class BatchPoolFactory : IBatchPoolFactory - { - private readonly IServiceProvider _serviceProvider; - - /// - /// Constructor for . - /// - /// A service object. - public BatchPoolFactory(IServiceProvider serviceProvider) - => _serviceProvider = serviceProvider; - - /// - public IBatchPool CreateNew() - => _serviceProvider.GetService(); - } -} diff --git a/src/TesApi.Web/BatchPools.BatchScheduler.cs b/src/TesApi.Web/BatchPools.BatchScheduler.cs index 3cdf55b75..00eb4129b 100644 --- a/src/TesApi.Web/BatchPools.BatchScheduler.cs +++ b/src/TesApi.Web/BatchPools.BatchScheduler.cs @@ -153,7 +153,7 @@ internal async Task GetOrAddPoolAsync(string key, bool isPreemptable modelPool.Metadata ??= new List(); modelPool.Metadata.Add(new(PoolHostName, this.batchPrefix)); modelPool.Metadata.Add(new(PoolIsDedicated, (!isPreemptable).ToString())); - var batchPool = batchPoolFactory.CreateNew(); + var batchPool = batchPoolFactory(); await batchPool.CreatePoolAndJobAsync(modelPool, isPreemptable, cancellationToken); pool = batchPool; } diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index c435c6c50..7533e7cb0 100644 --- 
a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -15,6 +15,7 @@ using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using Tes.Extensions; +using TesApi.Web.Events; using TesApi.Web.Extensions; using TesApi.Web.Management; using TesApi.Web.Management.Models.Quotas; @@ -72,31 +73,31 @@ public partial class BatchScheduler : IBatchScheduler private readonly string globalManagedIdentity; private readonly ContainerRegistryProvider containerRegistryProvider; private readonly string batchPrefix; - private readonly IBatchPoolFactory batchPoolFactory; - private readonly BatchTesEventMessageFactory batchTesEventMessageFactory; + private readonly Func batchPoolFactory; + private readonly Func, string, NodeEventMessage> batchTesEventMessageFactory; private readonly IAllowedVmSizesService allowedVmSizesService; private readonly TaskExecutionScriptingManager taskExecutionScriptingManager; /// /// Constructor for /// - /// Logger - /// Configuration of - /// Configuration of - /// Configuration of - /// Configuration of - /// Configuration of - /// Configuration of - /// Configuration of - /// Azure proxy - /// Storage access provider - /// Quota verifier > - /// Sku information provider - /// Container registry information - /// Batch pool factory + /// Logger . + /// Configuration of . + /// Configuration of . + /// Configuration of . + /// Configuration of . + /// Configuration of . + /// Configuration of . + /// Configuration of . + /// Azure proxy . + /// Storage access provider . + /// Quota verifier . + /// Sku information provider . + /// Container registry information . + /// factory. /// Service to get allowed vm sizes. - /// - /// + /// . + /// factory. 
public BatchScheduler( ILogger logger, IOptions batchGen1Options, @@ -111,10 +112,10 @@ public BatchScheduler( IBatchQuotaVerifier quotaVerifier, IBatchSkuInformationProvider skuInformationProvider, ContainerRegistryProvider containerRegistryProvider, - IBatchPoolFactory poolFactory, + Func poolFactory, IAllowedVmSizesService allowedVmSizesService, TaskExecutionScriptingManager taskExecutionScriptingManager, - BatchTesEventMessageFactory batchTesEventMessageFactory) + Func, string, NodeEventMessage> batchTesEventMessageFactory) { ArgumentNullException.ThrowIfNull(logger); ArgumentNullException.ThrowIfNull(azureProxy); @@ -194,9 +195,9 @@ async Task SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, Comb tesTaskLog.BatchNodeMetrics = batchNodeMetrics; tesTaskLog.CromwellResultCode = cromwellRcCode; tesTaskLog.EndTime = DateTime.UtcNow; - tesTaskExecutorLog.StartTime = taskStartTime ?? batchInfo.BatchTaskStartTime; - tesTaskExecutorLog.EndTime = taskEndTime ?? batchInfo.BatchTaskEndTime; - tesTaskExecutorLog.ExitCode = batchInfo.BatchTaskExitCode; + tesTaskExecutorLog.StartTime ??= taskStartTime ?? batchInfo.BatchTaskStartTime; + tesTaskExecutorLog.EndTime ??= taskEndTime ?? batchInfo.BatchTaskEndTime; + tesTaskExecutorLog.ExitCode ??= batchInfo.BatchTaskExitCode; // Only accurate when the task completes successfully, otherwise it's the Batch time as reported from Batch // TODO this could get large; why? 
@@ -261,6 +262,11 @@ Task HandlePreemptedNodeAsync(TesTask tesTask, CombinedBatchTaskInfo batch return Task.FromResult(false); } + Task HandleInfoUpdate(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) + { + return SetTaskStateAndLog(tesTask, tesTask.State, batchInfo, cancellationToken); + } + tesTaskStateTransitions = new List() { new TesTaskStateTransition(tesTaskDeletionReady, batchTaskState: null, alternateSystemLogItem: null, (tesTask, _, ct) => DeleteCancelledTaskAsync(tesTask, ct)), @@ -278,7 +284,8 @@ Task HandlePreemptedNodeAsync(TesTask tesTask, CombinedBatchTaskInfo batch new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.NodeUnusable, "Please open an issue. There should have been an error reported here.", SetTaskExecutorError), //new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.JobNotFound, BatchTaskState.JobNotFound.ToString(), SetTaskSystemError), //new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.MissingBatchTask, BatchTaskState.MissingBatchTask.ToString(), DeleteBatchJobAndSetTaskSystemErrorAsync), - new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.NodePreempted, alternateSystemLogItem: null, HandlePreemptedNodeAsync) + new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.NodePreempted, alternateSystemLogItem: null, HandlePreemptedNodeAsync), + new TesTaskStateTransition(condition: null, AzureBatchTaskState.TaskState.InfoUpdate, alternateSystemLogItem: null, HandleInfoUpdate) }.AsReadOnly(); } @@ -346,7 +353,7 @@ public async Task LoadExistingPoolsAsync(CancellationToken cancellationToken) try { var forceRemove = !string.IsNullOrWhiteSpace(globalManagedIdentity) && !(cloudPool.Identity?.UserAssignedIdentities?.Any(id => globalManagedIdentity.Equals(id.ResourceId, StringComparison.OrdinalIgnoreCase)) ?? 
false); - var batchPool = batchPoolFactory.CreateNew(); + var batchPool = batchPoolFactory(); await batchPool.AssignPoolAsync(cloudPool, forceRemove, cancellationToken); } catch (Exception exc) @@ -1323,7 +1330,7 @@ private static Dictionary DelimitedTextToDictionary(string text, /// - public async IAsyncEnumerable GetEventMessagesAsync([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken, string @event) + public async IAsyncEnumerable GetEventMessagesAsync([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken, string @event) { const string eventsFolderName = "events"; var prefix = eventsFolderName + "/"; @@ -1339,7 +1346,7 @@ public async IAsyncEnumerable GetEventMessagesAsync([System.Run await foreach (var blobItem in azureProxy.ListBlobsWithTagsAsync(new(await storageAccessProvider.GetInternalTesBlobUrlAsync(string.Empty, cancellationToken, needsTags: true)), prefix, cancellationToken).WithCancellation(cancellationToken)) { - if (blobItem.Tags.ContainsKey(TesEventMessage.ProcessedTag) || !blobItem.Tags.ContainsKey("task-id")) + if (blobItem.Tags.ContainsKey(NodeEventMessage.ProcessedTag) || !blobItem.Tags.ContainsKey("task-id")) { continue; } @@ -1349,7 +1356,7 @@ public async IAsyncEnumerable GetEventMessagesAsync([System.Run var pathFromEventName = blobItem.Name[eventsEndIndex..]; var eventName = pathFromEventName[..pathFromEventName.IndexOf('/')]; - yield return batchTesEventMessageFactory.CreateNew(new(blobUrl), blobItem.Tags, eventName); + yield return batchTesEventMessageFactory(new(blobUrl), blobItem.Tags, eventName); } } diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index b093192b9..15df05886 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -16,6 +16,7 @@ using Polly.Retry; using Tes.ApiClients; using Tes.ApiClients.Options; +using 
TesApi.Web.Extensions; using TesApi.Web.Storage; using BatchModels = Microsoft.Azure.Management.Batch.Models; diff --git a/src/TesApi.Web/Events/NodeEventMessage.cs b/src/TesApi.Web/Events/NodeEventMessage.cs new file mode 100644 index 000000000..8f5520ad1 --- /dev/null +++ b/src/TesApi.Web/Events/NodeEventMessage.cs @@ -0,0 +1,229 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Extensions.Logging; +using TesApi.Web.Storage; + +namespace TesApi.Web.Events +{ + /// + /// Represents the events sent by the node task runner. + /// + public class NodeEventMessage + { + /// + /// Blob tag used to record event processing. + /// + public const string ProcessedTag = "processed"; + + private readonly IStorageAccessProvider _storageAccessProvider; + private readonly IAzureProxy _azureProxy; + private readonly ILogger _logger; + private readonly Uri _uri; + + /// + /// Tags of this event message. + /// + public IDictionary Tags { get; } + + /// + /// Event of this event message. + /// + public string Event { get; } + + /// + /// Constructor of . + /// + /// + /// + /// + /// + /// + /// + public NodeEventMessage(IAzureProxy azureProxy, ILogger logger, IStorageAccessProvider storageAccessProvider, Uri blobAbsoluteUri, IDictionary tags, string @event) + { + ArgumentNullException.ThrowIfNull(azureProxy); + ArgumentNullException.ThrowIfNull(storageAccessProvider); + ArgumentNullException.ThrowIfNull(blobAbsoluteUri); + ArgumentNullException.ThrowIfNull(tags); + ArgumentNullException.ThrowIfNull(@event); + + if (tags.Count == 0) + { + throw new ArgumentException("This message has no tags.", nameof(tags)); + } + + if (tags.ContainsKey(ProcessedTag)) + { + throw new ArgumentException("This message was already processed.", nameof(tags)); + } + + // There are up to 10 tags allowed. We will be adding one. 
+ // https://learn.microsoft.com/azure/storage/blobs/storage-manage-find-blobs?tabs=azure-portal#setting-blob-index-tags + if (tags.Count > 9) + { + throw new ArgumentException("This message does not have space to add the processed tag.", nameof(tags)); + } + + _azureProxy = azureProxy; + _logger = logger; + _storageAccessProvider = storageAccessProvider; + _uri = blobAbsoluteUri; + Tags = tags.AsReadOnly(); + Event = @event; + } + + /// + /// Gets the details of this event message. + /// + /// + /// + public async Task<(string Id, AzureBatchTaskState State)> GetMessageBatchStateAsync(CancellationToken cancellationToken) + { + Tes.Runner.Events.EventMessage result = null; + + try + { + var messageText = await _azureProxy.DownloadBlobAsync(_uri, cancellationToken); + result = System.Text.Json.JsonSerializer.Deserialize(messageText) + ?? throw new InvalidOperationException("Deserialize() returned null."); + } + catch (Exception ex) + { + throw new InvalidOperationException($"Event message blob is malformed. 
{ex.GetType().FullName}:{ex.Message}", ex); + } + + System.Diagnostics.Debug.Assert(Guid.TryParse(result.Id, out _)); + System.Diagnostics.Debug.Assert(Tes.Runner.Events.EventsPublisher.EventVersion.Equals(result.EventVersion, StringComparison.Ordinal)); + System.Diagnostics.Debug.Assert(Tes.Runner.Events.EventsPublisher.EventDataVersion.Equals(result.EventDataVersion, StringComparison.Ordinal)); + System.Diagnostics.Debug.Assert(Tes.Runner.Events.EventsPublisher.TesTaskRunnerEntityType.Equals(result.EntityType, StringComparison.Ordinal)); + System.Diagnostics.Debug.Assert(Event.Equals(result.Name, StringComparison.Ordinal)); + + // Event type specific validations + switch (result.Name) + { + case Tes.Runner.Events.EventsPublisher.DownloadStartEvent: + System.Diagnostics.Debug.Assert(Tes.Runner.Events.EventsPublisher.StartedStatus.Equals(result.StatusMessage, StringComparison.Ordinal)); + break; + + case Tes.Runner.Events.EventsPublisher.DownloadEndEvent: + System.Diagnostics.Debug.Assert(new[] { Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(result.StatusMessage)); + break; + + case Tes.Runner.Events.EventsPublisher.UploadStartEvent: + System.Diagnostics.Debug.Assert(Tes.Runner.Events.EventsPublisher.StartedStatus.Equals(result.StatusMessage, StringComparison.Ordinal)); + break; + + case Tes.Runner.Events.EventsPublisher.UploadEndEvent: + System.Diagnostics.Debug.Assert(new[] { Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(result.StatusMessage)); + break; + + case Tes.Runner.Events.EventsPublisher.ExecutorStartEvent: + System.Diagnostics.Debug.Assert(Tes.Runner.Events.EventsPublisher.StartedStatus.Equals(result.StatusMessage, StringComparison.Ordinal)); + break; + + case Tes.Runner.Events.EventsPublisher.ExecutorEndEvent: + System.Diagnostics.Debug.Assert(new[] { Tes.Runner.Events.EventsPublisher.SuccessStatus, 
Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(result.StatusMessage)); + break; + + case Tes.Runner.Events.EventsPublisher.TaskCompletionEvent: + System.Diagnostics.Debug.Assert(new[] { Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(result.StatusMessage)); + break; + + default: + System.Diagnostics.Debug.Assert(false); + break; + } + + _logger.LogDebug("Getting batch task state from event {EventName} for {TesTask}.", result.Name ?? Event, result.EntityId); + return (result.EntityId, GetBatchTaskState(result)); + } + + /// + /// Marks this event message processed. + /// + /// + /// + public async Task MarkMessageProcessed(CancellationToken cancellationToken) + { + await _azureProxy.SetBlobTags( + _uri, + Tags + .Append(new(ProcessedTag, DateTime.UtcNow.ToString("O"))) + .ToDictionary(pair => pair.Key, pair => pair.Value), + cancellationToken); + } + + private /*static*/ AzureBatchTaskState GetBatchTaskState(Tes.Runner.Events.EventMessage message) + { + return (message.Name ?? Event) switch + { + Tes.Runner.Events.EventsPublisher.DownloadStartEvent => new(AzureBatchTaskState.TaskState.NoChange), + + Tes.Runner.Events.EventsPublisher.DownloadEndEvent => string.IsNullOrWhiteSpace(message.EventData["errorMessage"]) + + ? new( + AzureBatchTaskState.TaskState.NoChange) + + : new( + AzureBatchTaskState.TaskState.NodeFilesUploadOrDownloadFailed, + Failure: new("SystemError", + Enumerable.Empty() + .Append(message.EventData["errorMessage"]))), + + Tes.Runner.Events.EventsPublisher.ExecutorStartEvent => new(AzureBatchTaskState.TaskState.Running, BatchTaskStartTime: message.Created), + + Tes.Runner.Events.EventsPublisher.ExecutorEndEvent => string.IsNullOrWhiteSpace(message.EventData["errorMessage"]) + + ? 
new( + AzureBatchTaskState.TaskState.InfoUpdate, + BatchTaskEndTime: message.Created, + BatchTaskExitCode: int.Parse(message.EventData["exitCode"])) + + : new( + AzureBatchTaskState.TaskState.InfoUpdate, + Failure: new("ExecutorError", + Enumerable.Empty() + .Append(message.EventData["errorMessage"])), + BatchTaskEndTime: message.Created, + BatchTaskExitCode: int.Parse(message.EventData["exitCode"])), + + Tes.Runner.Events.EventsPublisher.UploadStartEvent => new(AzureBatchTaskState.TaskState.NoChange), + + Tes.Runner.Events.EventsPublisher.UploadEndEvent => string.IsNullOrWhiteSpace(message.EventData["errorMessage"]) + + ? new( + AzureBatchTaskState.TaskState.NoChange) + + : new( + AzureBatchTaskState.TaskState.NodeFilesUploadOrDownloadFailed, + Failure: new("SystemError", + Enumerable.Empty() + .Append(message.EventData["errorMessage"]))), // TODO + + Tes.Runner.Events.EventsPublisher.TaskCompletionEvent => string.IsNullOrWhiteSpace(message.EventData["errorMessage"]) + + ? new( + AzureBatchTaskState.TaskState.CompletedSuccessfully, + BatchTaskStartTime: message.Created - TimeSpan.Parse(message.EventData["duration"]), + BatchTaskEndTime: message.Created/*, + BatchTaskExitCode: 0*/) + + : new( + AzureBatchTaskState.TaskState.CompletedWithErrors, + Failure: new("ExecutorError", + Enumerable.Empty() + .Append(message.EventData["errorMessage"])), + BatchTaskStartTime: message.Created - TimeSpan.Parse(message.EventData["duration"]), + BatchTaskEndTime: message.Created), + + _ => throw new System.Diagnostics.UnreachableException(), + }; + } + } +} diff --git a/src/TesApi.Web/Controllers/OperationCancelledExceptionFilter.cs b/src/TesApi.Web/Exceptions/OperationCancelledExceptionFilter.cs similarity index 98% rename from src/TesApi.Web/Controllers/OperationCancelledExceptionFilter.cs rename to src/TesApi.Web/Exceptions/OperationCancelledExceptionFilter.cs index 1b9c5390c..18b3ed117 100644 --- a/src/TesApi.Web/Controllers/OperationCancelledExceptionFilter.cs +++ 
b/src/TesApi.Web/Exceptions/OperationCancelledExceptionFilter.cs @@ -6,7 +6,7 @@ using Microsoft.AspNetCore.Mvc.Filters; using Microsoft.Extensions.Logging; -namespace TesApi.Web.Controllers +namespace TesApi.Web { /// /// Global filter that handles . diff --git a/src/TesApi.Web/PagedInterfaceExtensions.cs b/src/TesApi.Web/Extensions/PagedInterfaceExtensions.cs similarity index 99% rename from src/TesApi.Web/PagedInterfaceExtensions.cs rename to src/TesApi.Web/Extensions/PagedInterfaceExtensions.cs index 297c2ab91..541ed3af5 100644 --- a/src/TesApi.Web/PagedInterfaceExtensions.cs +++ b/src/TesApi.Web/Extensions/PagedInterfaceExtensions.cs @@ -11,7 +11,7 @@ using Polly.Retry; // TODO: move this to Common.csproj? -namespace TesApi.Web +namespace TesApi.Web.Extensions { /// /// Extension methods and implementations for enumerating paged enumeration/collection types from Azure diff --git a/src/TesApi.Web/IBatchPoolFactory.cs b/src/TesApi.Web/IBatchPoolFactory.cs deleted file mode 100644 index b101d1e7e..000000000 --- a/src/TesApi.Web/IBatchPoolFactory.cs +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. - -namespace TesApi.Web -{ - /// - /// Factory to create BatchPool instances. - /// - public interface IBatchPoolFactory - { - /// - /// Creates instances. - /// - /// - IBatchPool CreateNew(); - } -} diff --git a/src/TesApi.Web/IBatchScheduler.cs b/src/TesApi.Web/IBatchScheduler.cs index 92a50944b..e8233a05f 100644 --- a/src/TesApi.Web/IBatchScheduler.cs +++ b/src/TesApi.Web/IBatchScheduler.cs @@ -108,6 +108,6 @@ public interface IBatchScheduler /// A for controlling the lifetime of the asynchronous operation. /// Optional event to retrieve. Defaults to all events. 
/// - IAsyncEnumerable GetEventMessagesAsync(CancellationToken cancellationToken, string @event = default); + IAsyncEnumerable GetEventMessagesAsync(CancellationToken cancellationToken, string @event = default); } } diff --git a/src/TesApi.Web/Management/ArmResourceInformationFinder.cs b/src/TesApi.Web/Management/ArmResourceInformationFinder.cs index 1ec0ca972..e047747eb 100644 --- a/src/TesApi.Web/Management/ArmResourceInformationFinder.cs +++ b/src/TesApi.Web/Management/ArmResourceInformationFinder.cs @@ -9,6 +9,7 @@ using Microsoft.Azure.Management.Batch; using Microsoft.Azure.Services.AppAuthentication; using Microsoft.Rest; +using TesApi.Web.Extensions; namespace TesApi.Web.Management { diff --git a/src/TesApi.Web/Scheduler.cs b/src/TesApi.Web/Scheduler.cs index 0c77b5d7a..c4d885a60 100644 --- a/src/TesApi.Web/Scheduler.cs +++ b/src/TesApi.Web/Scheduler.cs @@ -10,6 +10,7 @@ using Microsoft.Extensions.Logging; using Tes.Models; using Tes.Repository; +using TesApi.Web.Events; namespace TesApi.Web { @@ -157,12 +158,11 @@ private Task ExecuteUpdateTesTaskFromEventBlobAsync(CancellationToken stoppingTo /// async ValueTask UpdateTesTasksFromEventBlobsAsync(CancellationToken stoppingToken) { - var messageInfos = new ConcurrentBag(); + var messageInfos = new ConcurrentBag(); var messages = new ConcurrentBag<(string Id, AzureBatchTaskState State)>(); // Get and parse event blobs - await foreach (var message in batchScheduler.GetEventMessagesAsync(stoppingToken, Tes.Runner.Events.EventsPublisher.TaskCompletionEvent) - .Concat(batchScheduler.GetEventMessagesAsync(stoppingToken, Tes.Runner.Events.EventsPublisher.ExecutorStartEvent)) + await foreach (var message in batchScheduler.GetEventMessagesAsync(stoppingToken) .WithCancellation(stoppingToken)) { messageInfos.Add(message); @@ -184,20 +184,9 @@ await OrchestrateTesTasksOnBatchAsync( stoppingToken); // Helpers - async ValueTask ProcessMessage(TesEventMessage messageInfo, CancellationToken cancellationToken) + async 
ValueTask ProcessMessage(NodeEventMessage messageInfo, CancellationToken cancellationToken) { - // TODO: remove the switch (keeping the message state retrieval) when TesEventMessage.GetMessageBatchStateAsync() can process all events - switch (messageInfo.Event) - { - case Tes.Runner.Events.EventsPublisher.ExecutorStartEvent: - case Tes.Runner.Events.EventsPublisher.TaskCompletionEvent: - messages.Add(await messageInfo.GetMessageBatchStateAsync(cancellationToken)); - break; - - default: - break; - } - + messages.Add(await messageInfo.GetMessageBatchStateAsync(cancellationToken)); await messageInfo.MarkMessageProcessed(cancellationToken); } diff --git a/src/TesApi.Web/Startup.cs b/src/TesApi.Web/Startup.cs index 3065dbe63..2ced4bf44 100644 --- a/src/TesApi.Web/Startup.cs +++ b/src/TesApi.Web/Startup.cs @@ -2,6 +2,7 @@ // Licensed under the MIT License. using System; +using System.Collections.Generic; using System.IO; using System.Reflection; using Azure.Core; @@ -80,13 +81,13 @@ public void ConfigureServices(IServiceCollection services) .AddSingleton, TesRepositoryCache>() .AddSingleton() .AddSingleton() - .AddTransient() - .AddSingleton() + .AddTransient() + .AddSingleton, string, Events.NodeEventMessage>>(s => (blobAbsoluteUri, tags, @event) => ActivatorUtilities.CreateInstance(s, blobAbsoluteUri, tags, @event)) .AddTransient() - .AddSingleton() + .AddSingleton>(services => () => services.GetService()) .AddSingleton(CreateBatchPoolManagerFromConfiguration) - .AddControllers(options => options.Filters.Add()) + .AddControllers(options => options.Filters.Add()) .AddNewtonsoftJson(opts => { opts.SerializerSettings.ContractResolver = new CamelCasePropertyNamesContractResolver(); diff --git a/src/TesApi.Web/TesEventMessage.cs b/src/TesApi.Web/TesEventMessage.cs deleted file mode 100644 index 30cad76d9..000000000 --- a/src/TesApi.Web/TesEventMessage.cs +++ /dev/null @@ -1,180 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. 
- -using System; -using System.Collections.Generic; -using System.Linq; -using System.Threading; -using System.Threading.Tasks; -using Microsoft.Extensions.DependencyInjection; -using Microsoft.Extensions.Logging; -using TesApi.Web.Storage; - -namespace TesApi.Web -{ - /// - /// Represents the events sent by the node task runner. - /// - /// This should be transient in DI. - public class TesEventMessage - { - static TesEventMessage() => Tes.Utilities.NewtonsoftJsonSafeInit.SetDefaultSettings(); - - /// - /// Blob tag used to record event processing. - /// - public const string ProcessedTag = "processed"; - - private readonly IStorageAccessProvider _storageAccessProvider; - private readonly IAzureProxy _azureProxy; - private readonly ILogger _logger; - private readonly Uri _uri; - - /// - /// Tags of this event message. - /// - public IDictionary Tags { get; } - - /// - /// Event of this event message. - /// - public string Event { get; } - - /// - /// Constructor of . - /// - /// - /// - /// - /// - /// - /// - public TesEventMessage(IAzureProxy azureProxy, ILogger logger, IStorageAccessProvider storageAccessProvider, Uri blobAbsoluteUri, IDictionary tags, string @event) - { - ArgumentNullException.ThrowIfNull(azureProxy); - ArgumentNullException.ThrowIfNull(storageAccessProvider); - ArgumentNullException.ThrowIfNull(blobAbsoluteUri); - ArgumentNullException.ThrowIfNull(tags); - ArgumentNullException.ThrowIfNull(@event); - - if (tags.Count == 0) - { - throw new ArgumentException("This message has no tags.", nameof(tags)); - } - - if (tags.ContainsKey(ProcessedTag)) - { - throw new ArgumentException("This message was already processed.", nameof(tags)); - } - - // There are up to 10 tags allowed. We will be adding one. 
- // https://learn.microsoft.com/azure/storage/blobs/storage-manage-find-blobs?tabs=azure-portal#setting-blob-index-tags - if (tags.Count > 9) - { - throw new ArgumentException("This message does not have space to add the processed tag.", nameof(tags)); - } - - _azureProxy = azureProxy; - _logger = logger; - _storageAccessProvider = storageAccessProvider; - _uri = blobAbsoluteUri; - Tags = tags.AsReadOnly(); - Event = @event; - } - - /// - /// Gets the details of this event message. - /// - /// - /// - public async Task<(string Id, AzureBatchTaskState State)> GetMessageBatchStateAsync(CancellationToken cancellationToken) - { - var messageText = await _azureProxy.DownloadBlobAsync(_uri, cancellationToken); - var result = Newtonsoft.Json.JsonConvert.DeserializeObject(messageText); - - // TODO: throw if null - // Validate. Suggestions include: - //Guid.TryParse(result.Id, out _) - //Tes.Runner.Events.EventsPublisher.EventVersion.Equals(result.EventVersion, StringComparison.Ordinal) - //Tes.Runner.Events.EventsPublisher.EventDataVersion.Equals(result.EventDataVersion, StringComparison.Ordinal) - //Tes.Runner.Events.EventsPublisher.TesTaskRunnerEntityType.Equals(result.EntityType, StringComparison.Ordinal) - //Tes.Runner.Events.EventsPublisher.TesTaskRunnerEntityType.Equals(result.EntityType, StringComparison.Ordinal) - //Event.Equals(result.Name, StringComparison.Ordinal) - //new[] { Tes.Runner.Events.EventsPublisher.StartedStatus, Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(result.StatusMessage) - - // Event type specific validations - // - - _logger.LogDebug("Getting batch task state from event {EventName} for {TesTask}.", result.Name ?? Event, result.EntityId); - return (result.EntityId, GetCompletedBatchState(result)); - } - - /// - /// Marks this event message processed. 
- /// - /// - /// - public async Task MarkMessageProcessed(CancellationToken cancellationToken) - { - await _azureProxy.SetBlobTags( - _uri, - Tags.Append(new(ProcessedTag, DateTime.UtcNow.ToString("O"))) - .ToDictionary(pair => pair.Key, pair => pair.Value), - cancellationToken); - } - - private /*static*/ AzureBatchTaskState GetCompletedBatchState(Tes.Runner.Events.EventMessage task) - { - return (task.Name ?? Event) switch - { - Tes.Runner.Events.EventsPublisher.TaskCompletionEvent => string.IsNullOrWhiteSpace(task.EventData["errorMessage"]) - - ? new( - AzureBatchTaskState.TaskState.CompletedSuccessfully, - BatchTaskStartTime: task.Created - TimeSpan.Parse(task.EventData["duration"]), - BatchTaskEndTime: task.Created/*, - BatchTaskExitCode: 0*/) - - : new( - AzureBatchTaskState.TaskState.CompletedWithErrors, - Failure: new("ExecutorError", - Enumerable.Empty() - .Append(task.EventData["errorMessage"])), - BatchTaskStartTime: task.Created - TimeSpan.Parse(task.EventData["duration"]), - BatchTaskEndTime: task.Created/*, - BatchTaskExitCode: 0*/), - - Tes.Runner.Events.EventsPublisher.ExecutorStartEvent => new(AzureBatchTaskState.TaskState.Running), - - // TODO: the rest - _ => new(AzureBatchTaskState.TaskState.NodePreempted), //throw new System.Diagnostics.UnreachableException(), - }; - } - } - - // TODO: Consider moving this class's implementation to Startup - /// - /// Factory to create TesEventMessage instances. - /// - /// This can be a singleton in DI. - public sealed class BatchTesEventMessageFactory - { - private readonly IServiceProvider _serviceProvider; - - /// - /// Constructor for . - /// - /// A service object. - public BatchTesEventMessageFactory(IServiceProvider serviceProvider) => _serviceProvider = serviceProvider; - - /// - /// Creates a new . 
- /// - /// - /// - /// - /// - public TesEventMessage CreateNew(Uri blobAbsoluteUri, IDictionary tags, string @event) - => ActivatorUtilities.CreateInstance(_serviceProvider, blobAbsoluteUri, tags, @event); - } -} From d2ccfb2634f94e887cd63477b8ab9dd565e6f072 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 18 Oct 2023 19:24:39 -0700 Subject: [PATCH 045/202] Swap boolean flags for explicit permissions: trial the first --- src/TesApi.Tests/BatchSchedulerTests.cs | 2 +- .../TaskExecutionScriptingManagerTests.cs | 6 +- .../Runner/TaskToNodeTaskConverterTests.cs | 2 +- .../DefaultStorageAccessProviderTests.cs | 7 +- .../TerraStorageAccessProviderTests.cs | 18 ++-- src/TesApi.Web/BatchScheduler.cs | 17 ++-- src/TesApi.Web/ConfigurationUtils.cs | 4 +- .../Runner/TaskExecutionScriptingManager.cs | 4 +- .../Runner/TaskToNodeTaskConverter.cs | 2 +- .../Storage/DefaultStorageAccessProvider.cs | 85 ++++++++++++------- .../Storage/IStorageAccessProvider.cs | 48 +++++++---- .../Storage/StorageAccessProvider.cs | 18 ++-- .../Storage/TerraStorageAccessProvider.cs | 17 ++-- 13 files changed, 142 insertions(+), 88 deletions(-) diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index d7833b122..5bb2f3084 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -1380,7 +1380,7 @@ public async Task CromwellWriteFilesAreDiscoveredAndAddedIfMissedWithContentScri var storageAccessProvider = serviceProvider.GetServiceOrCreateInstance(); var commandScriptDir = new UriBuilder(commandScriptUri) { Path = Path.GetDirectoryName(commandScriptUri.AbsolutePath).Replace('\\', '/') }.Uri; - executionDirectoryUri = UrlMutableSASEqualityComparer.TrimUri(new Uri(storageAccessProvider.MapLocalPathToSasUrlAsync(commandScriptDir.IsFile ? 
commandScriptDir.AbsolutePath : commandScriptDir.AbsoluteUri, CancellationToken.None, getContainerSas: true).Result)); + executionDirectoryUri = UrlMutableSASEqualityComparer.TrimUri(new Uri(storageAccessProvider.MapLocalPathToSasUrlAsync(commandScriptDir.IsFile ? commandScriptDir.AbsolutePath : commandScriptDir.AbsoluteUri, Azure.Storage.Sas.BlobSasPermissions.List, CancellationToken.None).Result)); serviceProvider.AzureProxy.Setup(p => p.ListBlobsAsync(It.Is(executionDirectoryUri, new UrlMutableSASEqualityComparer()), It.IsAny())).Returns(executionDirectoryBlobs.ToAsyncEnumerable()); diff --git a/src/TesApi.Tests/Runner/TaskExecutionScriptingManagerTests.cs b/src/TesApi.Tests/Runner/TaskExecutionScriptingManagerTests.cs index 44606b05d..086690d71 100644 --- a/src/TesApi.Tests/Runner/TaskExecutionScriptingManagerTests.cs +++ b/src/TesApi.Tests/Runner/TaskExecutionScriptingManagerTests.cs @@ -43,11 +43,11 @@ public void SetUp() storageAccessProviderMock = new Mock(); storageAccessProviderMock.Setup(x => x.GetInternalTesTaskBlobUrlAsync(It.IsAny(), It.IsAny(), - It.IsAny(), It.IsAny())) + It.IsAny(), It.IsAny())) .ReturnsAsync(AssetUrl); storageAccessProviderMock.Setup(x => - x.GetInternalTesBlobUrlAsync(It.IsAny(), It.IsAny(), - It.IsAny(), It.IsAny())) + x.GetInternalTesBlobUrlAsync(It.IsAny(), + It.IsAny(), It.IsAny())) .ReturnsAsync(AssetUrl); taskToNodeTaskConverterMock = new Mock(); diff --git a/src/TesApi.Tests/Runner/TaskToNodeTaskConverterTests.cs b/src/TesApi.Tests/Runner/TaskToNodeTaskConverterTests.cs index d4fcffab1..2db31c6e2 100644 --- a/src/TesApi.Tests/Runner/TaskToNodeTaskConverterTests.cs +++ b/src/TesApi.Tests/Runner/TaskToNodeTaskConverterTests.cs @@ -53,7 +53,7 @@ public void SetUp() storageOptions = new StorageOptions() { ExternalStorageContainers = ExternalStorageContainerWithSas }; storageAccessProviderMock = new Mock(); storageAccessProviderMock.Setup(x => - x.GetInternalTesTaskBlobUrlAsync(It.IsAny(), It.IsAny(), It.IsAny(), It.IsAny())) + 
x.GetInternalTesTaskBlobUrlAsync(It.IsAny(), It.IsAny(), It.IsAny(), It.IsAny())) .ReturnsAsync(InternalBlobUrlWithSas); storageAccessProviderMock.Setup(x => x.GetInternalTesBlobUrlWithoutSasToken(It.IsAny())) diff --git a/src/TesApi.Tests/Storage/DefaultStorageAccessProviderTests.cs b/src/TesApi.Tests/Storage/DefaultStorageAccessProviderTests.cs index 772ef6894..b18796f18 100644 --- a/src/TesApi.Tests/Storage/DefaultStorageAccessProviderTests.cs +++ b/src/TesApi.Tests/Storage/DefaultStorageAccessProviderTests.cs @@ -20,7 +20,7 @@ namespace TesApi.Tests.Storage [TestClass, TestCategory("Unit")] public class DefaultStorageAccessProviderTests { - private IStorageAccessProvider defaultStorageAccessProvider; + private DefaultStorageAccessProvider defaultStorageAccessProvider; private Mock azureProxyMock; private StorageOptions storageOptions; private StorageAccountInfo storageAccountInfo; @@ -52,7 +52,7 @@ public async Task GetInternalTesTaskBlobUrlAsync_BlobPathIsProvided_ReturnsValid string blobName) { var task = new TesTask { Name = "taskName", Id = Guid.NewGuid().ToString() }; - var url = await defaultStorageAccessProvider.GetInternalTesTaskBlobUrlAsync(task, blobName, CancellationToken.None); + var url = await defaultStorageAccessProvider.GetInternalTesTaskBlobUrlAsync(task, blobName, Azure.Storage.Sas.BlobSasPermissions.Read, CancellationToken.None); Assert.IsNotNull(url); var uri = new Uri(url); @@ -78,7 +78,8 @@ public async Task GetInternalTesTaskBlobUrlAsync_BlobPathAndInternalPathPrefixIs { { TesResources.SupportedBackendParameters.internal_path_prefix.ToString(), internalPathPrefix } }; - var url = await defaultStorageAccessProvider.GetInternalTesTaskBlobUrlAsync(task, blobName, CancellationToken.None); + + var url = await defaultStorageAccessProvider.GetInternalTesTaskBlobUrlAsync(task, blobName, Azure.Storage.Sas.BlobSasPermissions.Read, CancellationToken.None); Assert.IsNotNull(url); var uri = new Uri(url); diff --git 
a/src/TesApi.Tests/TerraStorageAccessProviderTests.cs b/src/TesApi.Tests/TerraStorageAccessProviderTests.cs index ab0971ab8..0a0549fd9 100644 --- a/src/TesApi.Tests/TerraStorageAccessProviderTests.cs +++ b/src/TesApi.Tests/TerraStorageAccessProviderTests.cs @@ -29,7 +29,7 @@ public class TerraStorageAccessProviderTests private Mock wsmApiClientMock; private Mock azureProxyMock; - private IStorageAccessProvider terraStorageAccessProvider; + private TerraStorageAccessProvider terraStorageAccessProvider; private TerraApiStubData terraApiStubData; private Mock> optionsMock; private TerraOptions terraOptions; @@ -73,7 +73,7 @@ public async Task MapLocalPathToSasUrlAsync_ValidInput(string input) { SetUpTerraApiClient(); - var result = await terraStorageAccessProvider.MapLocalPathToSasUrlAsync(input, CancellationToken.None); + var result = await terraStorageAccessProvider.MapLocalPathToSasUrlAsync(input, Azure.Storage.Sas.BlobSasPermissions.Read, CancellationToken.None, sasTokenDuration: default); Assert.IsNotNull(terraApiStubData.GetWsmSasTokenApiResponse().Url, result); } @@ -102,7 +102,7 @@ public async Task MapLocalPathToSasUrlAsync_GetContainerSasIsTrue(string input, { SetUpTerraApiClient(); - var result = await terraStorageAccessProvider.MapLocalPathToSasUrlAsync(input + blobPath, CancellationToken.None, getContainerSas: true); + var result = await terraStorageAccessProvider.MapLocalPathToSasUrlAsync(input + blobPath, Azure.Storage.Sas.BlobSasPermissions.List, CancellationToken.None, sasTokenDuration: default); Assert.IsNotNull(result); Assert.AreEqual($"{expected}?sv={TerraApiStubData.SasToken}", result); @@ -118,7 +118,7 @@ public async Task MapLocalPathToSasUrlAsync_GetContainerSasIsTrue(string input, [ExpectedException(typeof(InvalidOperationException))] public async Task MapLocalPathToSasUrlAsync_InvalidStorageAccountInputs(string input) { - await terraStorageAccessProvider.MapLocalPathToSasUrlAsync(input, CancellationToken.None); + await 
terraStorageAccessProvider.MapLocalPathToSasUrlAsync(input, Azure.Storage.Sas.BlobSasPermissions.Read, CancellationToken.None, sasTokenDuration: default); } [TestMethod] @@ -145,7 +145,7 @@ public async Task GetInternalTesBlobUrlAsync_BlobPathIsProvided_ReturnsValidURLW { SetUpTerraApiClient(); - var url = await terraStorageAccessProvider.GetInternalTesBlobUrlAsync(blobName, CancellationToken.None); + var url = await terraStorageAccessProvider.GetInternalTesBlobUrlAsync(blobName, Azure.Storage.Sas.BlobSasPermissions.Read, CancellationToken.None); Assert.IsNotNull(url); var uri = new Uri(url); @@ -161,7 +161,7 @@ public async Task GetInternalTesTaskBlobUrlAsync_BlobPathIsProvided_ReturnsValid { SetUpTerraApiClient(); var task = new TesTask { Name = "taskName", Id = Guid.NewGuid().ToString() }; - var url = await terraStorageAccessProvider.GetInternalTesTaskBlobUrlAsync(task, blobName, CancellationToken.None); + var url = await terraStorageAccessProvider.GetInternalTesTaskBlobUrlAsync(task, blobName, Azure.Storage.Sas.BlobSasPermissions.Read, CancellationToken.None); Assert.IsNotNull(url); var uri = new Uri(url); @@ -184,7 +184,8 @@ public async Task GetInternalTesTaskBlobUrlAsync_BlobPathAndInternalPathPrefixIs { { TesResources.SupportedBackendParameters.internal_path_prefix.ToString(), internalPathPrefix } }; - var url = await terraStorageAccessProvider.GetInternalTesTaskBlobUrlAsync(task, blobName, CancellationToken.None); + + var url = await terraStorageAccessProvider.GetInternalTesTaskBlobUrlAsync(task, blobName, Azure.Storage.Sas.BlobSasPermissions.Read, CancellationToken.None); Assert.IsNotNull(url); var uri = new Uri(url); @@ -207,7 +208,8 @@ public async Task GetInternalTesTaskBlobUrlAsync_BlobPathAndInternalPathPrefixAr { { TesResources.SupportedBackendParameters.internal_path_prefix.ToString(), internalPrefix } }; - var url = await terraStorageAccessProvider.GetInternalTesTaskBlobUrlAsync(task, blobPath, CancellationToken.None); + + var url = await 
terraStorageAccessProvider.GetInternalTesTaskBlobUrlAsync(task, blobPath, Azure.Storage.Sas.BlobSasPermissions.Read, CancellationToken.None); Assert.IsNotNull(url); Assert.AreNotEqual('/', capturedTokenApiParameters.SasBlobName[0]); diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 7533e7cb0..fdbcfdf3f 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -366,7 +366,7 @@ public async Task LoadExistingPoolsAsync(CancellationToken cancellationToken) /// public async Task UploadTaskRunnerIfNeeded(CancellationToken cancellationToken) { - var blobUri = new Uri(await storageAccessProvider.GetInternalTesBlobUrlAsync(NodeTaskRunnerFilename, cancellationToken, needsWrite: true)); + var blobUri = new Uri(await storageAccessProvider.GetInternalTesBlobUrlAsync(NodeTaskRunnerFilename, storageAccessProvider.BlobPermissionsWithWrite, cancellationToken)); var blobProperties = await azureProxy.GetBlobPropertiesAsync(blobUri, cancellationToken); if (!(await File.ReadAllTextAsync(Path.Combine(AppContext.BaseDirectory, $"scripts/{NodeTaskRunnerMD5HashFilename}"), cancellationToken)).Trim().Equals(blobProperties?.ContentMD5, StringComparison.OrdinalIgnoreCase)) { @@ -804,7 +804,7 @@ private async Task> GetExistingBlobsInCromwellStorageLocationAsTe } var executionDirectoryUri = await storageAccessProvider.MapLocalPathToSasUrlAsync(cromwellExecutionDirectoryUrl, - cancellationToken, getContainerSas: true); + storageAccessProvider.DefaultContainerPermissions, cancellationToken); if (executionDirectoryUri is not null) { var blobsInExecutionDirectory = @@ -882,7 +882,7 @@ private void ValidateTesTaskInput(TesInput inputFile, TesTask tesTask) var globalStartTaskConfigured = !string.IsNullOrWhiteSpace(globalStartTaskPath); var startTaskSasUrl = globalStartTaskConfigured - ? 
await storageAccessProvider.MapLocalPathToSasUrlAsync(globalStartTaskPath, cancellationToken, sasTokenDuration: BatchPoolService.RunInterval.Multiply(2).Add(poolLifetime).Add(TimeSpan.FromMinutes(15))) + ? await storageAccessProvider.MapLocalPathToSasUrlAsync(globalStartTaskPath, storageAccessProvider.DefaultBlobPermissions, cancellationToken, sasTokenDuration: BatchPoolService.RunInterval.Multiply(2).Add(poolLifetime).Add(TimeSpan.FromMinutes(15))) : default; if (startTaskSasUrl is not null) @@ -1344,14 +1344,21 @@ public async IAsyncEnumerable GetEventMessagesAsync([System.Ru var eventsStartIndex = (string.IsNullOrEmpty(tesInternalSegments.BlobName) ? string.Empty : (tesInternalSegments.BlobName + "/")).Length; var eventsEndIndex = eventsStartIndex + eventsFolderName.Length + 1; - await foreach (var blobItem in azureProxy.ListBlobsWithTagsAsync(new(await storageAccessProvider.GetInternalTesBlobUrlAsync(string.Empty, cancellationToken, needsTags: true)), prefix, cancellationToken).WithCancellation(cancellationToken)) + await foreach (var blobItem in azureProxy.ListBlobsWithTagsAsync( + new(await storageAccessProvider.GetInternalTesBlobUrlAsync( + string.Empty, + Azure.Storage.Sas.BlobSasPermissions.Read | Azure.Storage.Sas.BlobSasPermissions.Tag | Azure.Storage.Sas.BlobSasPermissions.List, + cancellationToken)), + prefix, + cancellationToken) + .WithCancellation(cancellationToken)) { if (blobItem.Tags.ContainsKey(NodeEventMessage.ProcessedTag) || !blobItem.Tags.ContainsKey("task-id")) { continue; } - var blobUrl = await storageAccessProvider.GetInternalTesBlobUrlAsync(blobItem.Name[eventsStartIndex..], cancellationToken, needsTags: true, needsWrite: true); + var blobUrl = await storageAccessProvider.GetInternalTesBlobUrlAsync(blobItem.Name[eventsStartIndex..], storageAccessProvider.BlobPermissionsWithWriteAndTag, cancellationToken); var pathFromEventName = blobItem.Name[eventsEndIndex..]; var eventName = pathFromEventName[..pathFromEventName.IndexOf('/')]; diff 
--git a/src/TesApi.Web/ConfigurationUtils.cs b/src/TesApi.Web/ConfigurationUtils.cs index c51f904f4..207182c3b 100644 --- a/src/TesApi.Web/ConfigurationUtils.cs +++ b/src/TesApi.Web/ConfigurationUtils.cs @@ -72,8 +72,8 @@ public ConfigurationUtils( /// public async Task> ProcessAllowedVmSizesConfigurationFileAsync(CancellationToken cancellationToken) { - var supportedVmSizesUrl = new Uri(await storageAccessProvider.GetInternalTesBlobUrlAsync("/configuration/supported-vm-sizes", cancellationToken, needsWrite: true)); - var allowedVmSizesUrl = new Uri(await storageAccessProvider.GetInternalTesBlobUrlAsync("/configuration/allowed-vm-sizes", cancellationToken, needsWrite: true)); + var supportedVmSizesUrl = new Uri(await storageAccessProvider.GetInternalTesBlobUrlAsync("/configuration/supported-vm-sizes", storageAccessProvider.BlobPermissionsWithWrite, cancellationToken)); + var allowedVmSizesUrl = new Uri(await storageAccessProvider.GetInternalTesBlobUrlAsync("/configuration/allowed-vm-sizes", storageAccessProvider.BlobPermissionsWithWrite, cancellationToken)); var supportedVmSizes = (await skuInformationProvider.GetVmSizesAndPricesAsync(batchAccountResourceInformation.Region, cancellationToken)).ToList(); var batchAccountQuotas = await quotaProvider.GetVmCoreQuotaAsync(lowPriority: false, cancellationToken: cancellationToken); diff --git a/src/TesApi.Web/Runner/TaskExecutionScriptingManager.cs b/src/TesApi.Web/Runner/TaskExecutionScriptingManager.cs index 08d8eb73f..0cc3782a7 100644 --- a/src/TesApi.Web/Runner/TaskExecutionScriptingManager.cs +++ b/src/TesApi.Web/Runner/TaskExecutionScriptingManager.cs @@ -125,7 +125,7 @@ private async Task CreateAndUploadBatchScriptAsync(TesTask tesTask, stri { logger.LogInformation($"Creating and uploading Batch script for Task ID: {tesTask.Id}"); - var nodeTaskRunnerUrl = await storageAccessProvider.GetInternalTesBlobUrlAsync(NodeTaskRunnerFilename, cancellationToken); + var nodeTaskRunnerUrl = await 
storageAccessProvider.GetInternalTesBlobUrlAsync(NodeTaskRunnerFilename, storageAccessProvider.DefaultBlobPermissions, cancellationToken); var batchNodeScript = batchNodeScriptBuilder .WithAlpineWgetInstallation() @@ -166,7 +166,7 @@ private async Task UploadContentAsBlobToInternalTesLocationAsync(TesTask string content, string fileName, CancellationToken cancellationToken) { var blobUrl = - await storageAccessProvider.GetInternalTesTaskBlobUrlAsync(tesTask, fileName, cancellationToken, needsWrite: true); + await storageAccessProvider.GetInternalTesTaskBlobUrlAsync(tesTask, fileName, storageAccessProvider.BlobPermissionsWithWrite, cancellationToken); await storageAccessProvider.UploadBlobAsync(new Uri(blobUrl), content, cancellationToken); return blobUrl; diff --git a/src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs b/src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs index 6c8575901..35c773830 100644 --- a/src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs +++ b/src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs @@ -320,7 +320,7 @@ private async Task UploadContentAndCreateTesInputAsync(TesTask tesTask { var inputFileUrl = await storageAccessProvider.GetInternalTesTaskBlobUrlAsync(tesTask, Guid.NewGuid().ToString(), - cancellationToken, needsWrite: true); + storageAccessProvider.BlobPermissionsWithWrite, cancellationToken); //return the URL without the SAS token, the runner will add it using the transformation strategy await storageAccessProvider.UploadBlobAsync(new Uri(inputFileUrl), content, cancellationToken); diff --git a/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs b/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs index 1542a6216..dfe513a42 100644 --- a/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs +++ b/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs @@ -23,7 +23,6 @@ namespace TesApi.Web.Storage /// public class DefaultStorageAccessProvider : StorageAccessProvider { - private static readonly TimeSpan SasTokenDuration = 
TimeSpan.FromDays(7); //TODO: refactor this to drive it from configuration. private readonly string defaultStorageAccountName; private readonly List externalStorageContainers; @@ -89,12 +88,7 @@ public override async Task IsPublicHttpUrlAsync(string uriString, Cancella } /// - public override Task MapLocalPathToSasUrlAsync(string path, CancellationToken cancellationToken, TimeSpan? sasTokenDuration, bool getContainerSas) - { - return MapLocalPathToSasUrlImplAsync(path, sasTokenDuration, getContainerSas, cancellationToken); - } - - private async Task MapLocalPathToSasUrlImplAsync(string path, TimeSpan? sasTokenDuration, bool getContainerSas, CancellationToken cancellationToken) + public override async Task MapLocalPathToSasUrlAsync(string path, BlobSasPermissions sasPermissions, CancellationToken cancellationToken, TimeSpan? sasTokenDuration) { // TODO: Optional: If path is /container/... where container matches the name of the container in the default storage account, prepend the account name to the path. // This would allow the user to omit the account name for files stored in the default storage account @@ -120,7 +114,9 @@ private async Task MapLocalPathToSasUrlImplAsync(string path, TimeSpan? { try { - var result = await AddSasTokenAsync(pathSegments, sasTokenDuration, getContainerSas, needsTags: false, needsWrite: true, cancellationToken: cancellationToken, path: path); + var result = pathSegments.IsContainer + ? await AddSasTokenAsync(pathSegments, sasTokenDuration, ConvertSasPermissions(sasPermissions, nameof(sasPermissions)), path: path, cancellationToken: cancellationToken) + : await AddSasTokenAsync(pathSegments, sasTokenDuration, sasPermissions, path: path, cancellationToken: cancellationToken); return result.ToUriString(); } catch @@ -130,7 +126,27 @@ private async Task MapLocalPathToSasUrlImplAsync(string path, TimeSpan? } } - private async Task AddSasTokenAsync(StorageAccountUrlSegments pathSegments, TimeSpan? 
sasTokenDuration, bool getContainerSas, bool? needsTags, bool? needsWrite, CancellationToken cancellationToken, string path = default) + private Task AddSasTokenAsync(StorageAccountUrlSegments pathSegments, TimeSpan? sasTokenDuration, BlobSasPermissions blobPermissions, CancellationToken cancellationToken, string path = default) + { + if (pathSegments.IsContainer) + { + throw new ArgumentException(nameof(blobPermissions), "BlobContainerSasPermissions must be used with containers."); + } + + return AddSasTokenAsyncImpl(pathSegments, sasTokenDuration, expiresOn => new BlobSasBuilder(blobPermissions, expiresOn), path, cancellationToken); + } + + private Task AddSasTokenAsync(StorageAccountUrlSegments pathSegments, TimeSpan? sasTokenDuration, BlobContainerSasPermissions containerPermissions, CancellationToken cancellationToken, string path = default) + { + if (!pathSegments.IsContainer) + { + throw new ArgumentException(nameof(containerPermissions), "BlobSasPermissions must be used with blobs."); + } + + return AddSasTokenAsyncImpl(pathSegments, sasTokenDuration, expiresOn => new BlobSasBuilder(containerPermissions, expiresOn), path, cancellationToken); + } + + private async Task AddSasTokenAsyncImpl(StorageAccountUrlSegments pathSegments, TimeSpan? 
sasTokenDuration, Func createBuilder, string path, CancellationToken cancellationToken) { StorageAccountInfo storageAccountInfo = null; @@ -145,25 +161,11 @@ private async Task AddSasTokenAsync(StorageAccountUrl var accountKey = await AzureProxy.GetStorageAccountKeyAsync(storageAccountInfo, cancellationToken); var resultPathSegments = new StorageAccountUrlSegments(storageAccountInfo.BlobEndpoint, pathSegments.ContainerName, pathSegments.BlobName); - var sasBlobPermissions = BlobSasPermissions.Read; - var sasContainerPermissions = BlobContainerSasPermissions.Add | BlobContainerSasPermissions.Create | BlobContainerSasPermissions.List | BlobContainerSasPermissions.Read | BlobContainerSasPermissions.Write; - - if (needsTags.GetValueOrDefault()) - { - sasContainerPermissions |= BlobContainerSasPermissions.Tag; - sasBlobPermissions |= BlobSasPermissions.Tag; - } - - if (needsWrite.GetValueOrDefault()) - { - sasBlobPermissions |= BlobSasPermissions.Add | BlobSasPermissions.Create | BlobSasPermissions.List | BlobSasPermissions.Write; - } - var expiresOn = DateTimeOffset.UtcNow.Add((sasTokenDuration ?? TimeSpan.Zero) + SasTokenDuration); - var builder = pathSegments.IsContainer || getContainerSas ? new BlobSasBuilder(sasContainerPermissions, expiresOn) : new BlobSasBuilder(sasBlobPermissions, expiresOn); + var builder = createBuilder(expiresOn); builder.BlobContainerName = resultPathSegments.ContainerName; - builder.BlobName = pathSegments.IsContainer || getContainerSas ? string.Empty : resultPathSegments.BlobName; + builder.BlobName = resultPathSegments.BlobName; builder.Protocol = SasProtocol.Https; resultPathSegments.SasToken = builder.ToSasQueryParameters(new StorageSharedKeyCredential(storageAccountInfo.Name, accountKey)).ToString(); @@ -177,13 +179,36 @@ private async Task AddSasTokenAsync(StorageAccountUrl } /// - public override async Task GetInternalTesBlobUrlAsync(string blobPath, CancellationToken cancellationToken, bool? needsTags, bool? 
needsWrite) + public override async Task GetInternalTesBlobUrlAsync(string blobPath, BlobSasPermissions sasPermissions, CancellationToken cancellationToken) { var pathSegments = StorageAccountUrlSegments.Create(GetInternalTesBlobUrlWithoutSasToken(blobPath)); - var resultPathSegments = await AddSasTokenAsync(pathSegments, SasTokenDuration, false, needsTags, needsWrite, cancellationToken); + + var resultPathSegments = pathSegments.IsContainer + ? await AddSasTokenAsync(pathSegments, SasTokenDuration, ConvertSasPermissions(sasPermissions, nameof(sasPermissions)), cancellationToken) + : await AddSasTokenAsync(pathSegments, SasTokenDuration, sasPermissions, cancellationToken); return resultPathSegments.ToUriString(); } + private static BlobContainerSasPermissions ConvertSasPermissions(BlobSasPermissions sasPermissions, string paramName) + { + BlobContainerSasPermissions result = 0; + + if (sasPermissions.HasFlag(BlobSasPermissions.Read)) { result |= BlobContainerSasPermissions.Read; } + if (sasPermissions.HasFlag(BlobSasPermissions.Add)) { result |= BlobContainerSasPermissions.Add; } + if (sasPermissions.HasFlag(BlobSasPermissions.Create)) { result |= BlobContainerSasPermissions.Create; } + if (sasPermissions.HasFlag(BlobSasPermissions.Write)) { result |= BlobContainerSasPermissions.Write; } + if (sasPermissions.HasFlag(BlobSasPermissions.Delete)) { result |= BlobContainerSasPermissions.Delete; } + if (sasPermissions.HasFlag(BlobSasPermissions.Tag)) { result |= BlobContainerSasPermissions.Tag; } + if (sasPermissions.HasFlag(BlobSasPermissions.DeleteBlobVersion)) { result |= BlobContainerSasPermissions.DeleteBlobVersion; } + if (sasPermissions.HasFlag(BlobSasPermissions.List)) { result |= BlobContainerSasPermissions.List; } + if (sasPermissions.HasFlag(BlobSasPermissions.Move)) { result |= BlobContainerSasPermissions.Move; } + if (sasPermissions.HasFlag(BlobSasPermissions.Execute)) { result |= BlobContainerSasPermissions.Execute; } + if 
(sasPermissions.HasFlag(BlobSasPermissions.SetImmutabilityPolicy)) { result |= BlobContainerSasPermissions.SetImmutabilityPolicy; } + if (sasPermissions.HasFlag(BlobSasPermissions.PermanentDelete)) { throw new ArgumentOutOfRangeException(paramName, nameof(BlobSasPermissions.PermanentDelete), "Permission that cannot be applied to container was provided."); } + + return result; + } + private static string NormalizedBlobPath(string blobPath) { return string.IsNullOrEmpty(blobPath) ? string.Empty : $"/{blobPath.TrimStart('/')}"; @@ -219,7 +244,7 @@ public override string GetInternalTesBlobUrlWithoutSasToken(string blobPath) } /// - public override async Task GetInternalTesTaskBlobUrlAsync(TesTask task, string blobPath, CancellationToken cancellationToken, bool? needsWrite) + public override async Task GetInternalTesTaskBlobUrlAsync(TesTask task, string blobPath, BlobSasPermissions sasPermissions, CancellationToken cancellationToken) { var normalizedBlobPath = NormalizedBlobPath(blobPath); @@ -229,10 +254,10 @@ public override async Task GetInternalTesTaskBlobUrlAsync(TesTask task, var blobPathWithPathPrefix = $"/{defaultStorageAccountName}/{task.Resources.GetBackendParameterValue(TesResources.SupportedBackendParameters.internal_path_prefix).Trim('/')}{normalizedBlobPath}"; - return await This.MapLocalPathToSasUrlAsync(blobPathWithPathPrefix, cancellationToken, getContainerSas: needsWrite ?? 
false); + return await MapLocalPathToSasUrlAsync(blobPathWithPathPrefix, sasPermissions, cancellationToken, sasTokenDuration: default); } - return await This.GetInternalTesBlobUrlAsync($"/{task.Id}{normalizedBlobPath}", cancellationToken, needsWrite: needsWrite); + return await GetInternalTesBlobUrlAsync($"/{task.Id}{normalizedBlobPath}", sasPermissions, cancellationToken); } private async Task TryGetStorageAccountInfoAsync(string accountName, CancellationToken cancellationToken, Action onSuccess = null) diff --git a/src/TesApi.Web/Storage/IStorageAccessProvider.cs b/src/TesApi.Web/Storage/IStorageAccessProvider.cs index b1f254ee8..9d6b485b7 100644 --- a/src/TesApi.Web/Storage/IStorageAccessProvider.cs +++ b/src/TesApi.Web/Storage/IStorageAccessProvider.cs @@ -4,6 +4,7 @@ using System; using System.Threading; using System.Threading.Tasks; +using Azure.Storage.Sas; using Tes.Models; namespace TesApi.Web.Storage @@ -13,6 +14,32 @@ namespace TesApi.Web.Storage /// public interface IStorageAccessProvider { + /// + /// SAS permissions previously given all containers and when container SAS was requested. + /// + public BlobSasPermissions DefaultContainerPermissions => BlobSasPermissions.Add | BlobSasPermissions.Create | BlobSasPermissions.List | BlobSasPermissions.Read | BlobSasPermissions.Write; + + + /// + /// SAS permissions previously given all blobs when container SAS was not requested. + /// + public BlobSasPermissions DefaultBlobPermissions => BlobSasPermissions.Read; + + /// + /// SAS default blob permissions including Create/Write. + /// + public BlobSasPermissions BlobPermissionsWithWrite => DefaultBlobPermissions | BlobSasPermissions.Add | BlobSasPermissions.Create | BlobSasPermissions.List | BlobSasPermissions.Write; + + /// + /// SAS default blob permissions including Tag. + /// + public BlobSasPermissions BlobPermissionsWithTag => DefaultBlobPermissions | BlobSasPermissions.Tag; + + /// + /// SAS default blob permissions including Create/Write and Tag. 
+ /// + public BlobSasPermissions BlobPermissionsWithWriteAndTag => BlobPermissionsWithWrite | BlobSasPermissions.Tag; + /// /// Retrieves file content /// @@ -73,39 +100,30 @@ public interface IStorageAccessProvider /// - https://accountName.blob.core.windows.net/containerName/blobName /// /// The file path to convert. Two-part path is treated as container path. Paths with three or more parts are treated as blobs. + /// /// A for controlling the lifetime of the asynchronous operation. /// Duration SAS should be valid. - /// Get the container SAS even if path is longer than two parts /// An Azure Block Blob or Container URL with SAS token - public Task MapLocalPathToSasUrlAsync(string path, CancellationToken cancellationToken, TimeSpan? sasTokenDuration = default, bool getContainerSas = false); + public Task MapLocalPathToSasUrlAsync(string path, BlobSasPermissions sasPermissions, CancellationToken cancellationToken, TimeSpan? sasTokenDuration = default); /// /// Returns an Azure Storage Blob URL with a SAS token for the specified blob path in the TES internal storage location /// /// + /// /// - /// Optional. Read or write the tags on a blob. - /// Optional. Allow changing the blob. /// - /// - /// If the blobPath is not provided(empty), a container SAS token is generated. - /// If the blobPath is provided, a SAS token to the blobPath prefixed with the TES internal segments is generated. - /// - public Task GetInternalTesBlobUrlAsync(string blobPath, CancellationToken cancellationToken, bool? needsTags = default, bool? needsWrite = default); + public Task GetInternalTesBlobUrlAsync(string blobPath, BlobSasPermissions sasPermissions, CancellationToken cancellationToken); /// /// Returns an Azure Storage Blob URL with a SAS token for the specified blob path in the TES task internal storage location. /// /// /// + /// /// - /// Optional. Allow changing the blob. /// - /// - /// If the blobPath is not provided(empty), a container SAS token is generated. 
- /// If the blobPath is provided, a SAS token to the blobPath prefixed with the TES task internal segments is generated. - /// - public Task GetInternalTesTaskBlobUrlAsync(TesTask task, string blobPath, CancellationToken cancellationToken, bool? needsWrite = default); + public Task GetInternalTesTaskBlobUrlAsync(TesTask task, string blobPath, BlobSasPermissions sasPermissions, CancellationToken cancellationToken); /// /// Returns an Azure Storage Blob URL without a SAS token for the specified blob path in the TES task internal storage location. diff --git a/src/TesApi.Web/Storage/StorageAccessProvider.cs b/src/TesApi.Web/Storage/StorageAccessProvider.cs index 77ccb0a50..2c6eaa7a5 100644 --- a/src/TesApi.Web/Storage/StorageAccessProvider.cs +++ b/src/TesApi.Web/Storage/StorageAccessProvider.cs @@ -4,6 +4,7 @@ using System; using System.Threading; using System.Threading.Tasks; +using Azure.Storage.Sas; using Microsoft.Extensions.Logging; using Tes.Models; @@ -44,15 +45,10 @@ protected StorageAccessProvider(ILogger logger, IAzureProxy azureProxy) this.AzureProxy = azureProxy; } - /// - /// Property to assist in directly accessing methods in this instance through the interface. 
- /// - protected IStorageAccessProvider This => this; - /// public async Task DownloadBlobAsync(string blobRelativePath, CancellationToken cancellationToken) { - var url = await This.MapLocalPathToSasUrlAsync(blobRelativePath, cancellationToken); + var url = await MapLocalPathToSasUrlAsync(blobRelativePath, BlobSasPermissions.Read, cancellationToken, sasTokenDuration: default); if (url is null) { @@ -85,7 +81,7 @@ public async Task DownloadBlobAsync(Uri blobAbsoluteUrl, CancellationTok /// public async Task UploadBlobAsync(string blobRelativePath, string content, CancellationToken cancellationToken) - => await this.AzureProxy.UploadBlobAsync(new Uri(await This.MapLocalPathToSasUrlAsync(blobRelativePath, cancellationToken, getContainerSas: true)), content, cancellationToken); + => await this.AzureProxy.UploadBlobAsync(new Uri(await MapLocalPathToSasUrlAsync(blobRelativePath, BlobSasPermissions.Create | BlobSasPermissions.Write, cancellationToken, sasTokenDuration: default)), content, cancellationToken); /// public async Task UploadBlobAsync(Uri blobAbsoluteUrl, string content, @@ -98,19 +94,19 @@ public async Task UploadBlobAsync(Uri blobAbsoluteUrl, string content, /// public async Task UploadBlobFromFileAsync(string blobRelativePath, string sourceLocalFilePath, CancellationToken cancellationToken) - => await this.AzureProxy.UploadBlobFromFileAsync(new Uri(await This.MapLocalPathToSasUrlAsync(blobRelativePath, cancellationToken, getContainerSas: true)), sourceLocalFilePath, cancellationToken); + => await this.AzureProxy.UploadBlobFromFileAsync(new Uri(await MapLocalPathToSasUrlAsync(blobRelativePath, BlobSasPermissions.Create | BlobSasPermissions.Write, cancellationToken, sasTokenDuration: default)), sourceLocalFilePath, cancellationToken); /// public abstract Task IsPublicHttpUrlAsync(string uriString, CancellationToken cancellationToken); /// - public abstract Task MapLocalPathToSasUrlAsync(string path, CancellationToken cancellationToken, TimeSpan? 
sasTokenDuration, bool getContainerSas); + public abstract Task MapLocalPathToSasUrlAsync(string path, BlobSasPermissions sasPermissions, CancellationToken cancellationToken, TimeSpan? sasTokenDuration); /// - public abstract Task GetInternalTesBlobUrlAsync(string blobPath, CancellationToken cancellationToken, bool? needsTags, bool? needsWrite); + public abstract Task GetInternalTesBlobUrlAsync(string blobPath, BlobSasPermissions sasPermissions, CancellationToken cancellationToken); /// - public abstract Task GetInternalTesTaskBlobUrlAsync(TesTask task, string blobPath, CancellationToken cancellationToken, bool? needsWrite); + public abstract Task GetInternalTesTaskBlobUrlAsync(TesTask task, string blobPath, BlobSasPermissions sasPermissions, CancellationToken cancellationToken); /// public abstract string GetInternalTesTaskBlobUrlWithoutSasToken(TesTask task, string blobPath); diff --git a/src/TesApi.Web/Storage/TerraStorageAccessProvider.cs b/src/TesApi.Web/Storage/TerraStorageAccessProvider.cs index d2841e601..6ee65dde9 100644 --- a/src/TesApi.Web/Storage/TerraStorageAccessProvider.cs +++ b/src/TesApi.Web/Storage/TerraStorageAccessProvider.cs @@ -8,6 +8,7 @@ using System.Threading.Tasks; using System.Web; using Azure.Storage.Blobs; +using Azure.Storage.Sas; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using Tes.ApiClients; @@ -82,9 +83,12 @@ public override Task IsPublicHttpUrlAsync(string uriString, CancellationTo } /// - public override async Task MapLocalPathToSasUrlAsync(string path, CancellationToken cancellationToken, TimeSpan? sasTokenDuration, bool getContainerSas) + public override async Task MapLocalPathToSasUrlAsync(string path, BlobSasPermissions sasPermissions, CancellationToken cancellationToken, TimeSpan? sasTokenDuration) { + // Currently all SAS tokens with Terra are R/W but sasPermissions so only List value is used to select between a Container SAS vs a Blob SAS. 
+ ArgumentException.ThrowIfNullOrEmpty(path); + if (sasTokenDuration is not null) { throw new ArgumentException("Terra does not support extended length SAS tokens."); @@ -97,7 +101,7 @@ public override async Task MapLocalPathToSasUrlAsync(string path, Cancel var terraBlobInfo = await GetTerraBlobInfoFromContainerNameAsync(path, cancellationToken); - if (getContainerSas) + if (sasPermissions.HasFlag(BlobSasPermissions.List)) { return await GetMappedSasContainerUrlFromWsmAsync(terraBlobInfo, false, cancellationToken); } @@ -106,10 +110,11 @@ public override async Task MapLocalPathToSasUrlAsync(string path, Cancel } /// - public override async Task GetInternalTesBlobUrlAsync(string blobPath, CancellationToken cancellationToken, bool? needsTags, bool? needsWrite) + public override async Task GetInternalTesBlobUrlAsync(string blobPath, BlobSasPermissions sasPermissions, CancellationToken cancellationToken) { - // Currently all SAS tokens with Terra are R/W so needsWrite is waiting for a safer future. + // Currently all SAS tokens with Terra are R/W so sasPermissions is waiting for a safer future. + var needsTags = 0 != (sasPermissions & BlobSasPermissions.Tag); var blobInfo = GetTerraBlobInfoForInternalTes(blobPath); if (string.IsNullOrEmpty(blobPath)) @@ -121,9 +126,9 @@ public override async Task GetInternalTesBlobUrlAsync(string blobPath, C } /// - public override async Task GetInternalTesTaskBlobUrlAsync(TesTask task, string blobPath, CancellationToken cancellationToken, bool? needsWrite) + public override async Task GetInternalTesTaskBlobUrlAsync(TesTask task, string blobPath, BlobSasPermissions sasPermissions, CancellationToken cancellationToken) { - // Currently all SAS tokens with Terra are R/W so needsWrite is waiting for a safer future. + // Currently all SAS tokens with Terra are R/W so sasPermissions is waiting for a safer future. 
var blobInfo = GetTerraBlobInfoForInternalTesTask(task, blobPath); From 7ccc299c16aee684f2e0259b7f9ff8c7005d9cc2 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Tue, 24 Oct 2023 12:17:53 -0700 Subject: [PATCH 046/202] address found issues --- src/TesApi.Web/BatchPool.cs | 2 +- src/TesApi.Web/BatchPoolService.cs | 3 ++- src/TesApi.Web/BatchScheduler.cs | 6 +++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index e9b5e06c7..b628e62ff 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -564,7 +564,7 @@ TaskFailureInformation PopNextStartTaskFailure() /// public IAsyncEnumerable GetCompletedTasks(CancellationToken _1) - => GetTasksAsync("id,executionInfo", $"state eq 'completed' and stateTransitionTime lt {DateTime.UtcNow - TimeSpan.FromSeconds(30):O}"); + => GetTasksAsync("id,executionInfo", $"state eq 'completed' and stateTransitionTime lt '{DateTime.UtcNow - TimeSpan.FromSeconds(90):O}'"); // Returns false when pool/job was removed because it was not found. Returns true if the error was completely something else. private async ValueTask RemoveMissingPoolsAsync(Exception ex, CancellationToken cancellationToken) diff --git a/src/TesApi.Web/BatchPoolService.cs b/src/TesApi.Web/BatchPoolService.cs index 61ce934d6..313e6ac19 100644 --- a/src/TesApi.Web/BatchPoolService.cs +++ b/src/TesApi.Web/BatchPoolService.cs @@ -25,6 +25,7 @@ internal class BatchPoolService : OrchestrateOnBatchSchedulerServiceBase /// Interval between each call to . 
/// public static readonly TimeSpan RunInterval = TimeSpan.FromSeconds(30); // The very fastest process inside of Azure Batch accessing anything within pools or jobs uses a 30 second polling interval + public static readonly TimeSpan CompletedCloudTasksRunInterval = TimeSpan.FromSeconds(90); /// /// Default constructor @@ -133,7 +134,7 @@ async ValueTask ProcessFailures(IAsyncEnumerable<(string taskId, AzureBatchTaskS /// private Task ExecuteCompletedTesTasksOnBatchAsync(CancellationToken stoppingToken) { - return ExecuteActionOnIntervalAsync(RunInterval, ProcessCompletedCloudTasksAsync, stoppingToken); + return ExecuteActionOnIntervalAsync(CompletedCloudTasksRunInterval, ProcessCompletedCloudTasksAsync, stoppingToken); } /// diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index fdbcfdf3f..0510453ce 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -776,14 +776,12 @@ private string GetNodeManagedIdentityResourceId(TesTask task) private async Task> GetAdditionalCromwellInputsAsync(TesTask task, CancellationToken cancellationToken) { var cromwellExecutionDirectoryUrl = GetCromwellExecutionDirectoryPathAsUrl(task); - var isCromwell = cromwellExecutionDirectoryUrl is not null; - // TODO: Cromwell bug: Cromwell command write_tsv() generates a file in the execution directory, for example execution/write_tsv_3922310b441805fc43d52f293623efbc.tmp. These are not passed on to TES inputs. // WORKAROUND: Get the list of files in the execution directory and add them to task inputs. // TODO: Verify whether this workaround is still needed. 
var additionalInputs = new List(); - if (isCromwell) + if (cromwellExecutionDirectoryUrl is not null) { additionalInputs = await GetExistingBlobsInCromwellStorageLocationAsTesInputsAsync(task, cromwellExecutionDirectoryUrl, @@ -805,10 +803,12 @@ private async Task> GetExistingBlobsInCromwellStorageLocationAsTe var executionDirectoryUri = await storageAccessProvider.MapLocalPathToSasUrlAsync(cromwellExecutionDirectoryUrl, storageAccessProvider.DefaultContainerPermissions, cancellationToken); + if (executionDirectoryUri is not null) { var blobsInExecutionDirectory = await azureProxy.ListBlobsAsync(new Uri(executionDirectoryUri), cancellationToken).ToListAsync(cancellationToken); + logger.LogDebug($"Found {blobsInExecutionDirectory.Count} items in cromwell's task execution directory."); var scriptBlob = blobsInExecutionDirectory.FirstOrDefault(b => b.Name.EndsWith($"/{CromwellScriptFileName}")); var commandScript = From 6363a369d476f174ee631db327ba65a0e1fb342e Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 25 Oct 2023 19:58:10 -0700 Subject: [PATCH 047/202] Replace returned type (TesTask TesTask, Task IsModifiedAsync) --- src/TesApi.Web/BatchScheduler.cs | 16 +++---- src/TesApi.Web/IBatchScheduler.cs | 6 +-- .../OrchestrateOnBatchSchedulerServiceBase.cs | 9 ++-- src/TesApi.Web/TesTaskTask.cs | 45 +++++++++++++++++++ 4 files changed, 62 insertions(+), 14 deletions(-) create mode 100644 src/TesApi.Web/TesTaskTask.cs diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index c4357f966..fce803049 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -375,15 +375,15 @@ public async Task UploadTaskRunnerIfNeeded(CancellationToken cancellationToken) } /// - public IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessTesTaskBatchStatesAsync(IEnumerable tesTasks, AzureBatchTaskState[] taskStates, CancellationToken cancellationToken) + public IAsyncEnumerable> 
ProcessTesTaskBatchStatesAsync(IEnumerable tesTasks, AzureBatchTaskState[] taskStates, CancellationToken cancellationToken) { ArgumentNullException.ThrowIfNull(tesTasks); ArgumentNullException.ThrowIfNull(taskStates); return taskStates.Zip(tesTasks, (TaskState, TesTask) => (TaskState, TesTask)) .Where(entry => entry.TesTask?.IsActiveState() ?? false) // Removes already terminal TesTasks from being further processed. - .Select(entry => (entry.TesTask, IsModifiedAsync: WrapHandleTesTaskTransitionAsync(entry.TesTask, entry.TaskState, cancellationToken))) - .WhenEach(cancellationToken, tuple => tuple.IsModifiedAsync); + .Select(entry => new TesTaskTask(WrapHandleTesTaskTransitionAsync(entry.TesTask, entry.TaskState, cancellationToken), entry.TesTask)) + .WhenEach(cancellationToken, tesTaskTask => tesTaskTask.Task); async Task WrapHandleTesTaskTransitionAsync(TesTask tesTask, AzureBatchTaskState azureBatchTaskState, CancellationToken cancellationToken) => await HandleTesTaskTransitionAsync(tesTask, azureBatchTaskState, cancellationToken); @@ -465,7 +465,7 @@ private static bool IsCromwellCommandScript(TesInput inputFile) => (inputFile.Name?.Equals("commandScript") ?? false) && (inputFile.Description?.EndsWith(".commandScript") ?? 
false) && inputFile.Type == TesFileType.FILEEnum && inputFile.Path.EndsWith($"/{CromwellScriptFileName}"); /// - public async IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessQueuedTesTasksAsync(TesTask[] tesTasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) + public async IAsyncEnumerable> ProcessQueuedTesTasksAsync(TesTask[] tesTasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) { var tasksMetadataByPoolKey = new Dictionary Identities, string PoolDisplayName)>>(); var poolKeyByTaskIds = new Dictionary(); @@ -514,7 +514,7 @@ private static bool IsCromwellCommandScript(TesInput inputFile) if (quickResult is not null) { tasks.Remove(tesTask); - yield return (tesTask, quickResult); + yield return new(quickResult, tesTask); } } @@ -545,7 +545,7 @@ private static bool IsCromwellCommandScript(TesInput inputFile) foreach (var (task, _, _, _, _) in listOfTaskMetadata) { tasks.Remove(task); - yield return (task, HandleException(exception, key, task)); + yield return new(HandleException(exception, key, task), task); } } } @@ -612,11 +612,11 @@ private static bool IsCromwellCommandScript(TesInput inputFile) if (quickResult is not null) { - yield return (tesTask, quickResult); + yield return new(quickResult, tesTask); } else { - yield return (tesTask, Task.FromResult(true)); + yield return new(Task.FromResult(true), tesTask); } } diff --git a/src/TesApi.Web/IBatchScheduler.cs b/src/TesApi.Web/IBatchScheduler.cs index e8233a05f..4e8d06e4d 100644 --- a/src/TesApi.Web/IBatchScheduler.cs +++ b/src/TesApi.Web/IBatchScheduler.cs @@ -42,15 +42,15 @@ public interface IBatchScheduler /// s corresponding to each . /// A for controlling the lifetime of the asynchronous operation. /// True for each corresponding that needs to be persisted. 
- IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessTesTaskBatchStatesAsync(IEnumerable tesTasks, AzureBatchTaskState[] taskStates, CancellationToken cancellationToken); + IAsyncEnumerable> ProcessTesTaskBatchStatesAsync(IEnumerable tesTasks, AzureBatchTaskState[] taskStates, CancellationToken cancellationToken); /// /// Schedule queued s on a batch system /// /// s to schedule on the batch system. /// A for controlling the lifetime of the asynchronous operation. - /// True for each corresponding that needs to be persisted. - IAsyncEnumerable<(TesTask TesTask, Task IsModifiedAsync)> ProcessQueuedTesTasksAsync(TesTask[] tesTasks, CancellationToken cancellationToken); + /// True for each that needs to be persisted. + IAsyncEnumerable> ProcessQueuedTesTasksAsync(TesTask[] tesTasks, CancellationToken cancellationToken); /// /// Adds to the managed batch pools. diff --git a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs index 93e710924..434b06676 100644 --- a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs +++ b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs @@ -145,7 +145,7 @@ protected async Task ExecuteActionOnIntervalAsync(TimeSpan runInterval, Func, performs an action in the batch system using , and updates the resultant state /// /// A System.Threading.Tasks.ValueTask that represents the long running operations. 
- protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func>> tesTaskGetter, Func IsModifiedAsync)>> tesTaskProcessor, CancellationToken stoppingToken) + protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func>> tesTaskGetter, Func>> tesTaskProcessor, CancellationToken stoppingToken) { var tesTasks = await (await tesTaskGetter(stoppingToken)).ToArrayAsync(stoppingToken); @@ -157,14 +157,17 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func< var startTime = DateTime.UtcNow; - await foreach (var (tesTask, waitableResult) in tesTaskProcessor(tesTasks, stoppingToken).WithCancellation(stoppingToken)) + await foreach (var tesTaskTask in tesTaskProcessor(tesTasks, stoppingToken).WithCancellation(stoppingToken)) { + var tesTask = tesTaskTask.TesTask; + try { var isModified = false; + try { - isModified = await waitableResult; + isModified = await tesTaskTask; } catch (Exception exc) { diff --git a/src/TesApi.Web/TesTaskTask.cs b/src/TesApi.Web/TesTaskTask.cs new file mode 100644 index 000000000..1a56c1ea7 --- /dev/null +++ b/src/TesApi.Web/TesTaskTask.cs @@ -0,0 +1,45 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +namespace TesApi.Web +{ + /// + /// Combines a with a . + /// + /// The wrapped . + /// The wrapped . + /// The type of the result produced by the . + public record class TesTaskTask(System.Threading.Tasks.Task Task, Tes.Models.TesTask TesTask) : System.IDisposable + { + /// Gets an awaiter used to await the . + /// An awaiter instance. + // https://devblogs.microsoft.com/pfxteam/await-anything/ + public System.Runtime.CompilerServices.TaskAwaiter GetAwaiter() + { + return Task.GetAwaiter(); + } + + /// + /// Disposes the member , releasing all of its unmanaged resources. + /// + /// + /// Unlike most of the members of the member , this method is not thread-safe. 
+ /// Also, may only be called on a member that is in one of + /// the final states: RanToCompletion, + /// Faulted, or + /// Canceled. + /// + /// + /// The exception that is thrown if the member is not in + /// one of the final states: RanToCompletion, + /// Faulted, or + /// Canceled. + /// +#pragma warning disable CA1816 // Dispose methods should call SuppressFinalize + public void Dispose() +#pragma warning restore CA1816 // Dispose methods should call SuppressFinalize + { + Task.Dispose(); + } + } +} From ad6a77b9d0e0e51dfd1ea929621060fe24dccfd0 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 25 Oct 2023 20:03:20 -0700 Subject: [PATCH 048/202] formatting --- src/TesApi.Web/BatchPool.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index 6dd23c481..fdf032597 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -313,7 +313,7 @@ private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken canc { // This method implememts a state machine to disable/enable autoscaling as needed to clear certain conditions that can be observed // Inputs are _resetAutoScalingRequired, compute nodes in ejectable states, and the current _scalingMode, along with the pool's - // allocation state and autoscale enablement. + // allocation state and autoscale enablement. 
// This method must no-op when the allocation state is not Steady var (allocationState, _, autoScaleEnabled, _, _, _, _) = await _azureProxy.GetFullAllocationStateAsync(Id, cancellationToken); From 263ab63bdc62c1a1a286aeb8b4e329019747002b Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 25 Oct 2023 23:59:48 -0700 Subject: [PATCH 049/202] Add output logs --- .../Commands/ProcessLauncherTests.cs | 2 +- src/Tes.Runner.Test/ExecutorTests.cs | 4 +- .../Transfer/BlobApiHttpUtilsTests.cs | 2 +- src/Tes.Runner/Events/EventsPublisher.cs | 9 +- src/Tes.Runner/Executor.cs | 10 ++- src/Tes.Runner/Transfer/BlobUploader.cs | 3 + src/TesApi.Tests/BatchSchedulerTests.cs | 7 +- src/TesApi.Web/AzureBatchTaskState.cs | 32 ++++++- src/TesApi.Web/BatchScheduler.cs | 27 ++++-- src/TesApi.Web/Events/NodeEventMessage.cs | 89 +++++++++++++------ .../Runner/TaskExecutionScriptingManager.cs | 6 +- 11 files changed, 140 insertions(+), 51 deletions(-) diff --git a/src/Tes.Runner.Test/Commands/ProcessLauncherTests.cs b/src/Tes.Runner.Test/Commands/ProcessLauncherTests.cs index aaa0b84fb..69faea19e 100644 --- a/src/Tes.Runner.Test/Commands/ProcessLauncherTests.cs +++ b/src/Tes.Runner.Test/Commands/ProcessLauncherTests.cs @@ -12,7 +12,7 @@ namespace Tes.Runner.Test.Commands public class ProcessLauncherTests { private ProcessLauncher processLauncher = null!; - private Mock streamLogReaderMock; + private Mock streamLogReaderMock = null!; [TestInitialize] public void SetUp() diff --git a/src/Tes.Runner.Test/ExecutorTests.cs b/src/Tes.Runner.Test/ExecutorTests.cs index 226a1882f..35c33d194 100644 --- a/src/Tes.Runner.Test/ExecutorTests.cs +++ b/src/Tes.Runner.Test/ExecutorTests.cs @@ -95,7 +95,7 @@ public async Task UploadOutputsAsync_NoOutputProvided_StartSuccessEventsAreCreat var result = await executor.UploadOutputsAsync(blobPipelineOptions); Assert.AreEqual(Executor.ZeroBytesTransferred, result); eventsPublisherMock.Verify(p => p.PublishUploadStartEventAsync(It.IsAny()), Times.Once); - 
eventsPublisherMock.Verify(p => p.PublishUploadEndEventAsync(It.IsAny(), 0, 0, EventsPublisher.SuccessStatus, string.Empty), Times.Once); + eventsPublisherMock.Verify(p => p.PublishUploadEndEventAsync(It.IsAny(), 0, 0, EventsPublisher.SuccessStatus, string.Empty, null), Times.Once); } [TestMethod] @@ -104,7 +104,7 @@ public async Task UploadOutputAsync_NullOptionsThrowsError_StartFailureEventsAre await Assert.ThrowsExceptionAsync(() => executor.UploadOutputsAsync(null!)); eventsPublisherMock.Verify(p => p.PublishUploadStartEventAsync(It.IsAny()), Times.Once); - eventsPublisherMock.Verify(p => p.PublishUploadEndEventAsync(It.IsAny(), 0, 0, EventsPublisher.FailedStatus, It.Is((c) => !string.IsNullOrEmpty(c))), Times.Once); + eventsPublisherMock.Verify(p => p.PublishUploadEndEventAsync(It.IsAny(), 0, 0, EventsPublisher.FailedStatus, It.Is((c) => !string.IsNullOrEmpty(c)), null), Times.Once); } [TestMethod] diff --git a/src/Tes.Runner.Test/Transfer/BlobApiHttpUtilsTests.cs b/src/Tes.Runner.Test/Transfer/BlobApiHttpUtilsTests.cs index 08af190a6..268500041 100644 --- a/src/Tes.Runner.Test/Transfer/BlobApiHttpUtilsTests.cs +++ b/src/Tes.Runner.Test/Transfer/BlobApiHttpUtilsTests.cs @@ -246,7 +246,7 @@ public async Task CreatePutBlobRequestAsync_ValidInput_ExpectedRequestIsCreated( } else { - Assert.AreEqual(expectedContent, await request?.Content?.ReadAsStringAsync()); + Assert.AreEqual(expectedContent, await request?.Content?.ReadAsStringAsync()!); } diff --git a/src/Tes.Runner/Events/EventsPublisher.cs b/src/Tes.Runner/Events/EventsPublisher.cs index ce73d3ff3..11cb9388d 100644 --- a/src/Tes.Runner/Events/EventsPublisher.cs +++ b/src/Tes.Runner/Events/EventsPublisher.cs @@ -93,7 +93,7 @@ public virtual async Task PublishUploadStartEventAsync(NodeTask nodeTask) await PublishAsync(eventMessage); } - public virtual async Task PublishUploadEndEventAsync(NodeTask nodeTask, int numberOfFiles, long totalSizeInBytes, string statusMessage, string? 
errorMessage = default) + public virtual async Task PublishUploadEndEventAsync(NodeTask nodeTask, int numberOfFiles, long totalSizeInBytes, string statusMessage, string? errorMessage = default, IEnumerable<(long Length, Uri? BlobUrl, string FileName)>? completedFiles = default) { var eventMessage = CreateNewEventMessage(nodeTask.Id, UploadEndEvent, statusMessage, nodeTask.WorkflowId); @@ -105,6 +105,13 @@ public virtual async Task PublishUploadEndEventAsync(NodeTask nodeTask, int numb { "errorMessage", errorMessage??string.Empty} }; + foreach (var (length, blobUrl, fileName, index) in completedFiles?.Select((item, index) => (item.Length, item.BlobUrl, item.FileName, index)) ?? Enumerable.Empty<(long, Uri?, string, int)>()) + { + eventMessage.EventData.Add($"fileSize-{index}", length.ToString()); + eventMessage.EventData.Add($"fileUri-{index}", blobUrl?.ToString() ?? string.Empty); + eventMessage.EventData.Add($"filePath-{index}", fileName); + } + await PublishAsync(eventMessage); } diff --git a/src/Tes.Runner/Executor.cs b/src/Tes.Runner/Executor.cs index a0f3aaca3..e629ba0e7 100644 --- a/src/Tes.Runner/Executor.cs +++ b/src/Tes.Runner/Executor.cs @@ -92,6 +92,8 @@ public async Task UploadOutputsAsync(BlobPipelineOptions blobPipelineOptio var bytesTransferred = ZeroBytesTransferred; var numberOfOutputs = 0; var errorMessage = string.Empty; + IEnumerable<(long Length, Uri? BlobUrl, string FileName)>? 
completedFiles = default; + try { await eventsPublisher.PublishUploadStartEventAsync(tesNodeTask); @@ -115,7 +117,7 @@ public async Task UploadOutputsAsync(BlobPipelineOptions blobPipelineOptio var optimizedOptions = OptimizeBlobPipelineOptionsForUpload(blobPipelineOptions, outputs); - bytesTransferred = await UploadOutputsAsync(optimizedOptions, outputs); + (bytesTransferred, completedFiles) = await UploadOutputsAsync(optimizedOptions, outputs); await AppendMetrics(tesNodeTask.OutputsMetricsFormat, bytesTransferred); @@ -130,11 +132,11 @@ public async Task UploadOutputsAsync(BlobPipelineOptions blobPipelineOptio } finally { - await eventsPublisher.PublishUploadEndEventAsync(tesNodeTask, numberOfOutputs, bytesTransferred, statusMessage, errorMessage); + await eventsPublisher.PublishUploadEndEventAsync(tesNodeTask, numberOfOutputs, bytesTransferred, statusMessage, errorMessage, completedFiles); } } - private async Task UploadOutputsAsync(BlobPipelineOptions blobPipelineOptions, List outputs) + private async Task<(long BytesTransferred, IEnumerable<(long length, Uri? blobUrl, string fileName)> CompletedFiles)> UploadOutputsAsync(BlobPipelineOptions blobPipelineOptions, List outputs) { var memoryBufferChannel = await MemoryBufferPoolFactory.CreateMemoryBufferPoolAsync(blobPipelineOptions.MemoryBufferCapacity, blobPipelineOptions.BlockSizeBytes); @@ -144,7 +146,7 @@ private async Task UploadOutputsAsync(BlobPipelineOptions blobPipelineOpti logger.LogInformation($"Executed Upload. 
Time elapsed: {executionResult.Elapsed} Bandwidth: {BlobSizeUtils.ToBandwidth(executionResult.Result, executionResult.Elapsed.TotalSeconds)} MiB/s"); - return executionResult.Result; + return (executionResult.Result, uploader.CompletedFiles); } private async Task?> CreateUploadOutputsAsync() diff --git a/src/Tes.Runner/Transfer/BlobUploader.cs b/src/Tes.Runner/Transfer/BlobUploader.cs index cb17e8136..623f80606 100644 --- a/src/Tes.Runner/Transfer/BlobUploader.cs +++ b/src/Tes.Runner/Transfer/BlobUploader.cs @@ -14,6 +14,8 @@ public class BlobUploader : BlobOperationPipeline { private readonly ConcurrentDictionary hashListProviders = new(); + internal readonly ConcurrentBag<(long length, Uri? blobUrl, string fileName)> CompletedFiles = new(); + public BlobUploader(BlobPipelineOptions pipelineOptions, Channel memoryBufferPool) : base(pipelineOptions, memoryBufferPool) { } @@ -134,6 +136,7 @@ public override async Task OnCompletionAsync(long length, Uri? blobUrl, string f } finally { + CompletedFiles.Add((length, blobUrl, fileName)); response?.Dispose(); } } diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index 25cd28e6d..c8195b336 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -971,11 +971,12 @@ public async Task SuccessfullyCompletedTaskContainsBatchNodeMetrics() Assert.AreEqual(2, batchNodeMetrics.FileDownloadSizeInGB); Assert.AreEqual(4, batchNodeMetrics.FileUploadSizeInGB); - var executorLog = tesTask.GetOrAddTesTaskLog().GetOrAddExecutorLog(); + var taskLog = tesTask.GetOrAddTesTaskLog(); + var executorLog = taskLog.GetOrAddExecutorLog(); Assert.IsNotNull(executorLog); Assert.AreEqual(0, executorLog.ExitCode); - Assert.AreEqual(DateTimeOffset.Parse("2020-10-08T02:30:39+00:00"), executorLog.StartTime); - Assert.AreEqual(DateTimeOffset.Parse("2020-10-08T02:49:39+00:00"), executorLog.EndTime); + Assert.AreEqual(DateTimeOffset.Parse("2020-10-08T02:30:39+00:00"), 
taskLog.StartTime); + Assert.AreEqual(DateTimeOffset.Parse("2020-10-08T02:49:39+00:00"), taskLog.EndTime); }); } diff --git a/src/TesApi.Web/AzureBatchTaskState.cs b/src/TesApi.Web/AzureBatchTaskState.cs index a394d429f..ce096bbde 100644 --- a/src/TesApi.Web/AzureBatchTaskState.cs +++ b/src/TesApi.Web/AzureBatchTaskState.cs @@ -3,6 +3,7 @@ using System; using System.Collections.Generic; +using System.Linq; using static TesApi.Web.AzureBatchTaskState; namespace TesApi.Web @@ -15,9 +16,22 @@ namespace TesApi.Web /// Failure information. /// . /// . + /// + /// + /// /// . /// . - public record AzureBatchTaskState(TaskState State, IEnumerable OutputFileLogs = default, FailureInformation Failure = default, DateTimeOffset? CloudTaskCreationTime = default, DateTimeOffset? BatchTaskStartTime = default, DateTimeOffset? BatchTaskEndTime = default, int? BatchTaskExitCode = default) + public record AzureBatchTaskState( + TaskState State, + IEnumerable OutputFileLogs = default, + FailureInformation Failure = default, + DateTimeOffset? CloudTaskCreationTime = default, + DateTimeOffset? BatchTaskStartTime = default, + DateTimeOffset? ExecutorStartTime = default, + DateTimeOffset? ExecutorEndTime = default, + int? ExecutorExitCode = default, + DateTimeOffset? BatchTaskEndTime = default, + int? BatchTaskExitCode = default) { /// /// TesTask's state @@ -104,5 +118,21 @@ public record OutputFileLog(Uri Url, string Path, long Size); /// Failure code. Intended to be machine readable. See . /// Failure details to be added to . public record FailureInformation(string Reason, IEnumerable SystemLogs); + + /// + /// SystemLog appending constructor + /// + /// + /// + protected AzureBatchTaskState(AzureBatchTaskState other, string appendToSystemLog) + : this(other) + { + Failure = other.Failure switch + { + null => new("Unknown", Enumerable.Empty().Append(appendToSystemLog)), + { SystemLogs: null } => new(other.Failure.Reason ?? 
"Unknown", Enumerable.Empty().Append(appendToSystemLog)), + _ => new(other.Failure.Reason ?? "Unknown", other.Failure.SystemLogs.Append(appendToSystemLog)), + }; + } } } diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index fce803049..20b5db103 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -194,9 +194,11 @@ async Task SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, Comb tesTaskLog.BatchNodeMetrics = batchNodeMetrics; tesTaskLog.CromwellResultCode = cromwellRcCode; - tesTaskLog.EndTime = DateTime.UtcNow; - tesTaskExecutorLog.StartTime ??= taskStartTime ?? batchInfo.BatchTaskStartTime; - tesTaskExecutorLog.EndTime ??= taskEndTime ?? batchInfo.BatchTaskEndTime; + tesTaskLog.EndTime ??= taskEndTime ?? batchInfo.BatchTaskEndTime; + tesTaskLog.StartTime ??= taskStartTime ?? batchInfo.BatchTaskStartTime; + tesTaskLog.Outputs ??= batchInfo.OutputFileLogs?.Select(entry => new Tes.Models.TesOutputFileLog { Path = entry.Path, SizeBytes = $"{entry.Size}", Url = entry.Url.AbsoluteUri }).ToList(); + tesTaskExecutorLog.StartTime ??= batchInfo.ExecutorEndTime; + tesTaskExecutorLog.EndTime ??= batchInfo.ExecutorEndTime; tesTaskExecutorLog.ExitCode ??= batchInfo.BatchTaskExitCode; // Only accurate when the task completes successfully, otherwise it's the Batch time as reported from Batch @@ -217,6 +219,11 @@ async Task SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, Comb } } + if (!tesTask.IsActiveState()) + { + await taskExecutionScriptingManager.TryUploadServerTesTask(tesTask, "server-tes-task-completed.json", cancellationToken); + } + return true; } @@ -249,10 +256,9 @@ Task RequeueTaskAfterFailureAsync(TesTask tesTask, CombinedBatchTaskInfo b ? 
AddSystemLogAndSetTaskExecutorErrorAsync(tesTask, batchInfo, "System Error: Retry count exceeded.", cancellationToken) : SetTaskStateAfterFailureAsync(tesTask, TesState.QUEUEDEnum, batchInfo, cancellationToken); - Task AddSystemLogAndSetTaskExecutorErrorAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, string alternateSystemLogItem, CancellationToken cancellationToken) + Task AddSystemLogAndSetTaskExecutorErrorAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, string additionalSystemLogItem, CancellationToken cancellationToken) { - batchInfo = new(new(batchInfo.State, batchInfo.OutputFileLogs, new(batchInfo.Failure.Reason, (batchInfo.Failure.SystemLogs ?? Enumerable.Empty()).Append(alternateSystemLogItem)), batchInfo.CloudTaskCreationTime, batchInfo.BatchTaskStartTime, batchInfo.BatchTaskEndTime, batchInfo.BatchTaskExitCode), batchInfo.AlternateSystemLogItem); - return SetTaskExecutorError(tesTask, batchInfo, cancellationToken); + return SetTaskExecutorError(tesTask, new(batchInfo, additionalSystemLogItem), cancellationToken); } Task HandlePreemptedNodeAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) @@ -302,6 +308,7 @@ private async Task DeleteCancelledTaskAsync(TesTask tesTask, CancellationT await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); tesTask.IsTaskDeletionRequired = false; + await taskExecutionScriptingManager.TryUploadServerTesTask(tesTask, "server-tes-task-completed.json", cancellationToken); return true; } @@ -1425,8 +1432,14 @@ public async ValueTask ActionAsync(TesTask tesTask, AzureBatchTaskState ba private record CombinedBatchTaskInfo : AzureBatchTaskState { + public CombinedBatchTaskInfo(CombinedBatchTaskInfo state, string additionalSystemLogItem) + : base(state, additionalSystemLogItem) + { + AlternateSystemLogItem = state.AlternateSystemLogItem; + } + public CombinedBatchTaskInfo(AzureBatchTaskState state, string alternateSystemLogItem) - : 
base(state.State, state.OutputFileLogs, state.Failure, state.CloudTaskCreationTime, state.BatchTaskStartTime, state.BatchTaskEndTime, state.BatchTaskExitCode) + : base(state) { AlternateSystemLogItem = alternateSystemLogItem; } diff --git a/src/TesApi.Web/Events/NodeEventMessage.cs b/src/TesApi.Web/Events/NodeEventMessage.cs index 8f5520ad1..ef70ccdee 100644 --- a/src/TesApi.Web/Events/NodeEventMessage.cs +++ b/src/TesApi.Web/Events/NodeEventMessage.cs @@ -12,7 +12,7 @@ namespace TesApi.Web.Events { /// - /// Represents the events sent by the node task runner. + /// Represents an event sent by the node task runner. /// public class NodeEventMessage { @@ -159,71 +159,104 @@ await _azureProxy.SetBlobTags( cancellationToken); } - private /*static*/ AzureBatchTaskState GetBatchTaskState(Tes.Runner.Events.EventMessage message) + private AzureBatchTaskState GetBatchTaskState(Tes.Runner.Events.EventMessage message) { return (message.Name ?? Event) switch { - Tes.Runner.Events.EventsPublisher.DownloadStartEvent => new(AzureBatchTaskState.TaskState.NoChange), + Tes.Runner.Events.EventsPublisher.DownloadStartEvent => new(AzureBatchTaskState.TaskState.NoChange, + BatchTaskStartTime: message.Created), - Tes.Runner.Events.EventsPublisher.DownloadEndEvent => string.IsNullOrWhiteSpace(message.EventData["errorMessage"]) + Tes.Runner.Events.EventsPublisher.DownloadEndEvent => message.StatusMessage switch + { + Tes.Runner.Events.EventsPublisher.SuccessStatus => new(AzureBatchTaskState.TaskState.NoChange), - ? 
new( - AzureBatchTaskState.TaskState.NoChange) - - : new( + Tes.Runner.Events.EventsPublisher.FailedStatus => new( AzureBatchTaskState.TaskState.NodeFilesUploadOrDownloadFailed, Failure: new("SystemError", Enumerable.Empty() + .Append("Download failed.") .Append(message.EventData["errorMessage"]))), - Tes.Runner.Events.EventsPublisher.ExecutorStartEvent => new(AzureBatchTaskState.TaskState.Running, BatchTaskStartTime: message.Created), + _ => throw new System.Diagnostics.UnreachableException(), + }, - Tes.Runner.Events.EventsPublisher.ExecutorEndEvent => string.IsNullOrWhiteSpace(message.EventData["errorMessage"]) + Tes.Runner.Events.EventsPublisher.ExecutorStartEvent => new(AzureBatchTaskState.TaskState.Running, + ExecutorStartTime: message.Created), - ? new( + Tes.Runner.Events.EventsPublisher.ExecutorEndEvent => message.StatusMessage switch + { + Tes.Runner.Events.EventsPublisher.SuccessStatus => new( AzureBatchTaskState.TaskState.InfoUpdate, - BatchTaskEndTime: message.Created, - BatchTaskExitCode: int.Parse(message.EventData["exitCode"])) + ExecutorEndTime: message.Created, + ExecutorExitCode: int.Parse(message.EventData["exitCode"])), - : new( + Tes.Runner.Events.EventsPublisher.FailedStatus => new( AzureBatchTaskState.TaskState.InfoUpdate, Failure: new("ExecutorError", Enumerable.Empty() .Append(message.EventData["errorMessage"])), - BatchTaskEndTime: message.Created, - BatchTaskExitCode: int.Parse(message.EventData["exitCode"])), + ExecutorEndTime: message.Created, + ExecutorExitCode: int.Parse(message.EventData["exitCode"])), - Tes.Runner.Events.EventsPublisher.UploadStartEvent => new(AzureBatchTaskState.TaskState.NoChange), + _ => throw new System.Diagnostics.UnreachableException(), + }, - Tes.Runner.Events.EventsPublisher.UploadEndEvent => string.IsNullOrWhiteSpace(message.EventData["errorMessage"]) + Tes.Runner.Events.EventsPublisher.UploadStartEvent => new(AzureBatchTaskState.TaskState.NoChange), - ? 
new( - AzureBatchTaskState.TaskState.NoChange) + Tes.Runner.Events.EventsPublisher.UploadEndEvent => message.StatusMessage switch + { + Tes.Runner.Events.EventsPublisher.SuccessStatus => new( + AzureBatchTaskState.TaskState.InfoUpdate, + OutputFileLogs: GetFileLogs(message.EventData)), - : new( + Tes.Runner.Events.EventsPublisher.FailedStatus => new( AzureBatchTaskState.TaskState.NodeFilesUploadOrDownloadFailed, Failure: new("SystemError", Enumerable.Empty() - .Append(message.EventData["errorMessage"]))), // TODO + .Append("Upload failed.") + .Append(message.EventData["errorMessage"]))), - Tes.Runner.Events.EventsPublisher.TaskCompletionEvent => string.IsNullOrWhiteSpace(message.EventData["errorMessage"]) + _ => throw new System.Diagnostics.UnreachableException(), + }, - ? new( + Tes.Runner.Events.EventsPublisher.TaskCompletionEvent => message.StatusMessage switch + { + Tes.Runner.Events.EventsPublisher.SuccessStatus => new( AzureBatchTaskState.TaskState.CompletedSuccessfully, BatchTaskStartTime: message.Created - TimeSpan.Parse(message.EventData["duration"]), - BatchTaskEndTime: message.Created/*, - BatchTaskExitCode: 0*/) + BatchTaskEndTime: message.Created), - : new( + Tes.Runner.Events.EventsPublisher.FailedStatus => new( AzureBatchTaskState.TaskState.CompletedWithErrors, - Failure: new("ExecutorError", + Failure: new("SystemError", Enumerable.Empty() + .Append("Node script failed.") .Append(message.EventData["errorMessage"])), BatchTaskStartTime: message.Created - TimeSpan.Parse(message.EventData["duration"]), BatchTaskEndTime: message.Created), + _ => throw new System.Diagnostics.UnreachableException(), + }, + _ => throw new System.Diagnostics.UnreachableException(), }; } + + private static IEnumerable GetFileLogs(IDictionary eventData) + { + if (eventData is null) + { + yield break; + } + + var numberOfFiles = int.Parse(eventData["numberOfFiles"]); + for (var i = 0; i < numberOfFiles; ++i) + { + yield return new( + new Uri(eventData[$"fileUri-{i}"]), + 
eventData[$"filePath-{i}"], + long.Parse(eventData[$"fileSize-{i}"])); + } + } } } diff --git a/src/TesApi.Web/Runner/TaskExecutionScriptingManager.cs b/src/TesApi.Web/Runner/TaskExecutionScriptingManager.cs index 0cc3782a7..ea508e429 100644 --- a/src/TesApi.Web/Runner/TaskExecutionScriptingManager.cs +++ b/src/TesApi.Web/Runner/TaskExecutionScriptingManager.cs @@ -57,7 +57,7 @@ public async Task PrepareBatchScriptAsync(TesTask tesTask { try { - await TryUploadServerTesTask(tesTask, cancellationToken); + await TryUploadServerTesTask(tesTask, "server-tes-task.json", cancellationToken); var nodeTaskUrl = await CreateAndUploadNodeTaskAsync(tesTask, nodeTaskConversionOptions, cancellationToken); @@ -73,7 +73,7 @@ public async Task PrepareBatchScriptAsync(TesTask tesTask } } - private async Task TryUploadServerTesTask(TesTask tesTask, CancellationToken cancellationToken) + public async Task TryUploadServerTesTask(TesTask tesTask, string blobName, CancellationToken cancellationToken) { try { @@ -84,7 +84,7 @@ private async Task TryUploadServerTesTask(TesTask tesTask, CancellationToken can DefaultValueHandling = DefaultValueHandling.Ignore }); - await UploadContentAsBlobToInternalTesLocationAsync(tesTask, severTesTaskContent, "server-tes-task.json", + await UploadContentAsBlobToInternalTesLocationAsync(tesTask, severTesTaskContent, blobName, cancellationToken); } catch (Exception e) From dba4464b0babffcd0c879a1853ac55a36be5aac4 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 26 Oct 2023 13:50:55 -0700 Subject: [PATCH 050/202] Address azure api failures --- src/TesApi.Web/AzureProxy.cs | 2 +- src/TesApi.Web/BatchPool.cs | 2 +- .../Storage/DefaultStorageAccessProvider.cs | 12 +++++++----- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index 461206f6f..1bfc465b6 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -402,7 +402,7 @@ public async Task 
GetBlobPropertiesAsync(Uri blobAbsoluteUri, Ca { var directory = (new BlobClient(directoryUri, new(BlobClientOptions.ServiceVersion.V2021_04_10))); return directory.GetParentBlobContainerClient() - .GetBlobsAsync(prefix: directory.Name + "/", cancellationToken: cancellationToken) + .GetBlobsAsync(prefix: directory.Name.TrimEnd('/') + "/", cancellationToken: cancellationToken) .Select(blobItem => (blobItem.Name, new BlobUriBuilder(directory.Uri) { Sas = null, BlobName = blobItem.Name }.ToUri())); } diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index fdf032597..da2e1e730 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -658,7 +658,7 @@ TaskFailureInformation PopNextStartTaskFailure() /// public IAsyncEnumerable GetCompletedTasks(CancellationToken _1) - => GetTasksAsync("id,executionInfo", $"state eq 'completed' and stateTransitionTime lt '{DateTime.UtcNow - TimeSpan.FromMinutes(2):O}'"); + => GetTasksAsync("id,executionInfo", $"state eq 'completed' and stateTransitionTime lt DateTime'{DateTime.UtcNow - TimeSpan.FromMinutes(2):O}'"); /// public async ValueTask GetAllocationStateTransitionTime(CancellationToken cancellationToken = default) diff --git a/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs b/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs index 36507ac0c..2ed5c2ce8 100644 --- a/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs +++ b/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs @@ -103,7 +103,7 @@ public override async Task MapLocalPathToSasUrlAsync(string path, BlobSa //TODO: refactor this to throw an exception instead of logging and error and returning null. 
if (!StorageAccountUrlSegments.TryCreate(path, out var pathSegments)) { - Logger.LogError($"Could not parse path '{path}'."); + Logger.LogError("Could not parse path '{UnparsablePath}'.", path); return null; } @@ -131,17 +131,19 @@ private Task AddSasTokenAsync(StorageAccountUrlSegmen { if (pathSegments.IsContainer) { - throw new ArgumentException(nameof(blobPermissions), "BlobContainerSasPermissions must be used with containers."); + throw new ArgumentException("BlobContainerSasPermissions must be used with containers.", nameof(blobPermissions)); } - return AddSasTokenAsyncImpl(pathSegments, sasTokenDuration, expiresOn => new BlobSasBuilder(blobPermissions, expiresOn), path, cancellationToken); + return blobPermissions.HasFlag(BlobSasPermissions.List) + ? AddSasTokenAsyncImpl(pathSegments, sasTokenDuration, expiresOn => new BlobSasBuilder(ConvertSasPermissions(blobPermissions, nameof(blobPermissions)), expiresOn), path, cancellationToken) + : AddSasTokenAsyncImpl(pathSegments, sasTokenDuration, expiresOn => new BlobSasBuilder(blobPermissions, expiresOn), path, cancellationToken); } private Task AddSasTokenAsync(StorageAccountUrlSegments pathSegments, TimeSpan? 
sasTokenDuration, BlobContainerSasPermissions containerPermissions, CancellationToken cancellationToken, string path = default) { if (!pathSegments.IsContainer) { - throw new ArgumentException(nameof(containerPermissions), "BlobSasPermissions must be used with blobs."); + throw new ArgumentException("BlobSasPermissions must be used with blobs.", nameof(containerPermissions)); } return AddSasTokenAsyncImpl(pathSegments, sasTokenDuration, expiresOn => new BlobSasBuilder(containerPermissions, expiresOn), path, cancellationToken); @@ -205,7 +207,7 @@ private static BlobContainerSasPermissions ConvertSasPermissions(BlobSasPermissi if (sasPermissions.HasFlag(BlobSasPermissions.Move)) { result |= BlobContainerSasPermissions.Move; } if (sasPermissions.HasFlag(BlobSasPermissions.Execute)) { result |= BlobContainerSasPermissions.Execute; } if (sasPermissions.HasFlag(BlobSasPermissions.SetImmutabilityPolicy)) { result |= BlobContainerSasPermissions.SetImmutabilityPolicy; } - if (sasPermissions.HasFlag(BlobSasPermissions.PermanentDelete)) { throw new ArgumentOutOfRangeException(paramName, nameof(BlobSasPermissions.PermanentDelete), "Permission that cannot be applied to container was provided."); } + if (sasPermissions.HasFlag(BlobSasPermissions.PermanentDelete)) { throw new ArgumentOutOfRangeException(paramName, nameof(BlobSasPermissions.PermanentDelete), "A permission that cannot be applied to a container was provided when a container SAS was required."); } return result; } From 22f0a1d382d0d41b48736927e56f4e2ba894864d Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 26 Oct 2023 16:02:03 -0700 Subject: [PATCH 051/202] Fix list in blobs --- .../Storage/DefaultStorageAccessProvider.cs | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs b/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs index 2ed5c2ce8..147d28f8e 100644 --- a/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs +++ 
b/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs @@ -134,9 +134,7 @@ private Task AddSasTokenAsync(StorageAccountUrlSegmen throw new ArgumentException("BlobContainerSasPermissions must be used with containers.", nameof(blobPermissions)); } - return blobPermissions.HasFlag(BlobSasPermissions.List) - ? AddSasTokenAsyncImpl(pathSegments, sasTokenDuration, expiresOn => new BlobSasBuilder(ConvertSasPermissions(blobPermissions, nameof(blobPermissions)), expiresOn), path, cancellationToken) - : AddSasTokenAsyncImpl(pathSegments, sasTokenDuration, expiresOn => new BlobSasBuilder(blobPermissions, expiresOn), path, cancellationToken); + return AddSasTokenAsyncImpl(pathSegments, sasTokenDuration, (expiresOn, blobName) => new BlobSasBuilder(blobPermissions, expiresOn) { BlobName = blobPermissions.HasFlag(BlobSasPermissions.List) ? string.Empty : blobName }, path, cancellationToken); } private Task AddSasTokenAsync(StorageAccountUrlSegments pathSegments, TimeSpan? sasTokenDuration, BlobContainerSasPermissions containerPermissions, CancellationToken cancellationToken, string path = default) @@ -146,10 +144,10 @@ private Task AddSasTokenAsync(StorageAccountUrlSegmen throw new ArgumentException("BlobSasPermissions must be used with blobs.", nameof(containerPermissions)); } - return AddSasTokenAsyncImpl(pathSegments, sasTokenDuration, expiresOn => new BlobSasBuilder(containerPermissions, expiresOn), path, cancellationToken); + return AddSasTokenAsyncImpl(pathSegments, sasTokenDuration, (expiresOn, _1) => new BlobSasBuilder(containerPermissions, expiresOn) { BlobName = string.Empty }, path, cancellationToken); } - private async Task AddSasTokenAsyncImpl(StorageAccountUrlSegments pathSegments, TimeSpan? sasTokenDuration, Func createBuilder, string path, CancellationToken cancellationToken) + private async Task AddSasTokenAsyncImpl(StorageAccountUrlSegments pathSegments, TimeSpan? 
sasTokenDuration, Func createBuilder, string path, CancellationToken cancellationToken) { StorageAccountInfo storageAccountInfo = null; @@ -165,10 +163,9 @@ private async Task AddSasTokenAsyncImpl(StorageAccoun var resultPathSegments = new StorageAccountUrlSegments(storageAccountInfo.BlobEndpoint, pathSegments.ContainerName, pathSegments.BlobName); var expiresOn = DateTimeOffset.UtcNow.Add((sasTokenDuration ?? TimeSpan.Zero) + SasTokenDuration); - var builder = createBuilder(expiresOn); + var builder = createBuilder(expiresOn, resultPathSegments.BlobName); builder.BlobContainerName = resultPathSegments.ContainerName; - builder.BlobName = resultPathSegments.BlobName; builder.Protocol = SasProtocol.Https; resultPathSegments.SasToken = builder.ToSasQueryParameters(new StorageSharedKeyCredential(storageAccountInfo.Name, accountKey)).ToString(); From 1e88661dca936a29a62a6b62932ddf39d8efc437 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 26 Oct 2023 17:42:25 -0700 Subject: [PATCH 052/202] Cleanup --- src/TesApi.Tests/BatchSchedulerTests.cs | 119 +++++++-------- src/TesApi.Web/AzureBatchTaskState.cs | 6 +- src/TesApi.Web/BatchPool.cs | 27 +++- src/TesApi.Web/BatchPoolService.cs | 14 +- src/TesApi.Web/BatchScheduler.cs | 140 +++++++++--------- src/TesApi.Web/IBatchPool.cs | 15 +- .../Management/ArmBatchQuotaProvider.cs | 5 +- .../Management/BatchQuotaVerifier.cs | 13 +- .../Management/IBatchQuotaProvider.cs | 9 +- .../Management/IBatchQuotaVerifier.cs | 13 +- .../Management/TerraQuotaProvider.cs | 5 +- .../OrchestrateOnBatchSchedulerServiceBase.cs | 4 +- src/TesApi.Web/Scheduler.cs | 7 +- .../Storage/DefaultStorageAccessProvider.cs | 11 +- 14 files changed, 224 insertions(+), 164 deletions(-) diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index c8195b336..139afd2bb 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -24,6 +24,7 @@ using TesApi.Web.Management; using 
TesApi.Web.Management.Models.Quotas; using TesApi.Web.Storage; +using static TesApi.Web.Management.IBatchQuotaVerifier; namespace TesApi.Tests { @@ -41,7 +42,7 @@ public async Task LocalPoolCacheAccessesNewPoolsAfterAllPoolsRemovedWithSameKey( Assert.IsTrue(batchScheduler.RemovePoolFromList(pool)); Assert.AreEqual(0, batchScheduler.GetPoolGroupKeys().Count()); - pool = (BatchPool)await batchScheduler.GetOrAddPoolAsync(key, false, (id, cancellationToken) => ValueTask.FromResult(new Pool(name: id)), System.Threading.CancellationToken.None); + pool = (BatchPool)await batchScheduler.GetOrAddPoolAsync(key, false, (id, cancellationToken) => ValueTask.FromResult(new Pool(name: id)), CancellationToken.None); Assert.IsNotNull(pool); Assert.AreEqual(1, batchScheduler.GetPoolGroupKeys().Count()); @@ -58,16 +59,16 @@ public async Task GetOrAddDoesNotAddExistingAvailablePool() var keyCount = batchScheduler.GetPoolGroupKeys().Count(); var key = batchScheduler.GetPoolGroupKeys().First(); var count = batchScheduler.GetPools().Count(); - serviceProvider.AzureProxy.Verify(mock => mock.CreateBatchPoolAsync(It.IsAny(), It.IsAny(), It.IsAny()), Times.Once); + serviceProvider.AzureProxy.Verify(mock => mock.CreateBatchPoolAsync(It.IsAny(), It.IsAny(), It.IsAny()), Times.Once); - var pool = await batchScheduler.GetOrAddPoolAsync(key, false, (id, cancellationToken) => ValueTask.FromResult(new Pool(name: id)), System.Threading.CancellationToken.None); - await foreach (var _ in pool.ServicePoolAsync()) { } + var pool = await batchScheduler.GetOrAddPoolAsync(key, false, (id, cancellationToken) => ValueTask.FromResult(new Pool(name: id)), CancellationToken.None); + await pool.ServicePoolAsync(); Assert.AreEqual(count, batchScheduler.GetPools().Count()); Assert.AreEqual(keyCount, batchScheduler.GetPoolGroupKeys().Count()); //Assert.AreSame(info, pool); Assert.AreEqual(info.Id, pool.Id); - serviceProvider.AzureProxy.Verify(mock => mock.CreateBatchPoolAsync(It.IsAny(), It.IsAny(), It.IsAny()), 
Times.Once); + serviceProvider.AzureProxy.Verify(mock => mock.CreateBatchPoolAsync(It.IsAny(), It.IsAny(), It.IsAny()), Times.Once); } [TestMethod] @@ -82,8 +83,8 @@ public async Task GetOrAddDoesAddWithExistingUnavailablePool() var key = batchScheduler.GetPoolGroupKeys().First(); var count = batchScheduler.GetPools().Count(); - var pool = await batchScheduler.GetOrAddPoolAsync(key, false, (id, cancellationToken) => ValueTask.FromResult(new Pool(name: id)), System.Threading.CancellationToken.None); - await foreach (var _ in pool.ServicePoolAsync()) { } + var pool = await batchScheduler.GetOrAddPoolAsync(key, false, (id, cancellationToken) => ValueTask.FromResult(new Pool(name: id)), CancellationToken.None); + await pool.ServicePoolAsync(); Assert.AreNotEqual(count, batchScheduler.GetPools().Count()); Assert.AreEqual(keyCount, batchScheduler.GetPoolGroupKeys().Count()); @@ -220,7 +221,7 @@ public async Task TestIfVmSizeIsAvailable(string vmSize, bool preemptible) GetMockAllowedVms(config)); var batchScheduler = serviceProvider.GetT(); - var size = await ((BatchScheduler)batchScheduler).GetVmSizeAsync(task, System.Threading.CancellationToken.None); + var size = await ((BatchScheduler)batchScheduler).GetVmSizeAsync(task, CancellationToken.None); GuardAssertsWithTesTask(task, () => Assert.AreEqual(vmSize, size.VmSize)); } @@ -356,8 +357,8 @@ public Task AddBatchTaskHandlesAzureBatchPoolCreationExceptionViaJobCreation() return AddBatchTaskHandlesExceptions(TesState.QUEUEDEnum, Arranger, Validator); (Action, Action>) Arranger(AzureProxyReturnValues _1) - => (default, azureProxy => azureProxy.Setup(b => b.CreateBatchJobAsync(It.IsAny(), It.IsAny())) - .Callback((_1, _2) + => (default, azureProxy => azureProxy.Setup(b => b.CreateBatchJobAsync(It.IsAny(), It.IsAny())) + .Callback((_1, _2) => throw new Microsoft.Rest.Azure.CloudException("No job for you.") { Body = new() { Code = BatchErrorCodeStrings.OperationTimedOut } })); void Validator(TesTask tesTask, 
IEnumerable<(LogLevel logLevel, Exception exception)> logs) @@ -379,8 +380,8 @@ public Task AddBatchTaskHandlesAzureBatchPoolCreationExceptionViaPoolCreation() return AddBatchTaskHandlesExceptions(TesState.QUEUEDEnum, Arranger, Validator); (Action, Action>) Arranger(AzureProxyReturnValues _1) - => (default, azureProxy => azureProxy.Setup(b => b.CreateBatchPoolAsync(It.IsAny(), It.IsAny(), It.IsAny())) - .Callback((_1, _2, _3) + => (default, azureProxy => azureProxy.Setup(b => b.CreateBatchPoolAsync(It.IsAny(), It.IsAny(), It.IsAny())) + .Callback((_1, _2, _3) => throw new Microsoft.Rest.Azure.CloudException("No job for you.") { Body = new() { Code = BatchErrorCodeStrings.OperationTimedOut } })); void Validator(TesTask tesTask, IEnumerable<(LogLevel logLevel, Exception exception)> logs) @@ -468,8 +469,8 @@ public Task AddBatchTaskHandlesTesException() return AddBatchTaskHandlesExceptions(TesState.SYSTEMERROREnum, Arranger, Validator); (Action, Action>) Arranger(AzureProxyReturnValues _1) - => (default, azureProxy => azureProxy.Setup(b => b.CreateBatchPoolAsync(It.IsAny(), It.IsAny(), It.IsAny())) - .Callback((poolInfo, isPreemptible, cancellationToken) + => (default, azureProxy => azureProxy.Setup(b => b.CreateBatchPoolAsync(It.IsAny(), It.IsAny(), It.IsAny())) + .Callback((poolInfo, isPreemptible, cancellationToken) => throw new TesException("TestFailureReason"))); void Validator(TesTask tesTask, IEnumerable<(LogLevel logLevel, Exception exception)> logs) @@ -491,8 +492,8 @@ public Task AddBatchTaskHandlesBatchClientException() return AddBatchTaskHandlesExceptions(TesState.SYSTEMERROREnum, Arranger, Validator); (Action, Action>) Arranger(AzureProxyReturnValues _1) - => (default, azureProxy => azureProxy.Setup(b => b.AddBatchTaskAsync(It.IsAny(), It.IsAny(), It.IsAny(), It.IsAny())) - .Callback((_1, _2, _3, _4) + => (default, azureProxy => azureProxy.Setup(b => b.AddBatchTaskAsync(It.IsAny(), It.IsAny(), It.IsAny(), It.IsAny())) + .Callback((_1, _2, _3, _4) => throw 
typeof(BatchClientException) .GetConstructor(System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance, new[] { typeof(string), typeof(Exception) }) @@ -517,8 +518,8 @@ public Task AddBatchTaskHandlesBatchExceptionForJobQuota() return AddBatchTaskHandlesExceptions(TesState.QUEUEDEnum, Arranger, Validator); (Action, Action>) Arranger(AzureProxyReturnValues _1) - => (default, azureProxy => azureProxy.Setup(b => b.CreateBatchJobAsync(It.IsAny(), It.IsAny())) - .Callback((_1, _2) + => (default, azureProxy => azureProxy.Setup(b => b.CreateBatchJobAsync(It.IsAny(), It.IsAny())) + .Callback((_1, _2) => throw new BatchException( new Mock().Object, default, @@ -542,8 +543,8 @@ public Task AddBatchTaskHandlesBatchExceptionForPoolQuota() return AddBatchTaskHandlesExceptions(TesState.QUEUEDEnum, Arranger, Validator); (Action, Action>) Arranger(AzureProxyReturnValues _1) - => (default, azureProxy => azureProxy.Setup(b => b.CreateBatchPoolAsync(It.IsAny(), It.IsAny(), It.IsAny())) - .Callback((poolInfo, isPreemptible, cancellationToken) + => (default, azureProxy => azureProxy.Setup(b => b.CreateBatchPoolAsync(It.IsAny(), It.IsAny(), It.IsAny())) + .Callback((poolInfo, isPreemptible, cancellationToken) => throw new BatchException( new Mock().Object, default, @@ -567,8 +568,8 @@ public Task AddBatchTaskHandlesCloudExceptionForPoolQuota() return AddBatchTaskHandlesExceptions(TesState.QUEUEDEnum, Arranger, Validator); (Action, Action>) Arranger(AzureProxyReturnValues _1) - => (default, azureProxy => azureProxy.Setup(b => b.CreateBatchPoolAsync(It.IsAny(), It.IsAny(), It.IsAny())) - .Callback((poolInfo, isPreemptible, cancellationToken) + => (default, azureProxy => azureProxy.Setup(b => b.CreateBatchPoolAsync(It.IsAny(), It.IsAny(), It.IsAny())) + .Callback((poolInfo, isPreemptible, cancellationToken) => throw new Microsoft.Rest.Azure.CloudException() { Body = new() { Code = "AutoPoolCreationFailedWithQuotaReached", Message = "No autopool for you." 
} })); void Validator(TesTask task, IEnumerable<(LogLevel logLevel, Exception exception)> logs) @@ -588,7 +589,7 @@ public Task AddBatchTaskHandlesUnknownException() { var exceptionMsg = "Successful Test"; var batchQuotaProvider = new Mock(); - batchQuotaProvider.Setup(p => p.GetVmCoreQuotaAsync(It.IsAny(), It.IsAny())).Callback((lowPriority, _1) => throw new InvalidOperationException(exceptionMsg)); + batchQuotaProvider.Setup(p => p.GetVmCoreQuotaAsync(It.IsAny(), It.IsAny())).Callback((lowPriority, _1) => throw new InvalidOperationException(exceptionMsg)); return AddBatchTaskHandlesExceptions(TesState.SYSTEMERROREnum, Arranger, Validator); (Action, Action>) Arranger(AzureProxyReturnValues _1) @@ -622,7 +623,7 @@ public async Task BatchJobContainsExpectedBatchPoolInformation() GetMockAllowedVms(config)); var batchScheduler = serviceProvider.GetT(); - await foreach (var _ in batchScheduler.ProcessQueuedTesTasksAsync(new[] { tesTask }, System.Threading.CancellationToken.None)) { } + await foreach (var _ in batchScheduler.ProcessQueuedTesTasksAsync(new[] { tesTask }, CancellationToken.None)) { } var createBatchPoolAsyncInvocation = serviceProvider.AzureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IAzureProxy.CreateBatchPoolAsync)); var addBatchTaskAsyncInvocation = serviceProvider.AzureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IAzureProxy.AddBatchTaskAsync)); @@ -899,7 +900,7 @@ public async Task TaskGetsCancelled() { Assert.AreEqual(TesState.CANCELEDEnum, tesTask.State); Assert.IsTrue(tesTask.IsTaskDeletionRequired); - azureProxy.Verify(i => i.TerminateBatchTaskAsync(tesTask.Id, It.IsAny(), It.IsAny())); + azureProxy.Verify(i => i.TerminateBatchTaskAsync(tesTask.Id, It.IsAny(), It.IsAny())); }); } @@ -922,7 +923,7 @@ public async Task CancelledTaskGetsDeleted() GuardAssertsWithTesTask(tesTask, () => { Assert.IsFalse(tesTask.IsTaskDeletionRequired); - azureProxy.Verify(i => i.DeleteBatchTaskAsync(tesTask.Id, It.IsAny(), 
It.IsAny())); + azureProxy.Verify(i => i.DeleteBatchTaskAsync(tesTask.Id, It.IsAny(), It.IsAny())); }); } @@ -1465,7 +1466,7 @@ public async Task LocalFilesInCromwellTmpDirectoryAreDiscoveredAndUploaded() var inputFileUrl = filesToDownload.SingleOrDefault(f => f.StorageUrl.StartsWith("https://defaultstorageaccount.blob.core.windows.net/tes-internal/") && f.LocalPath.Equals("%AZ_BATCH_TASK_WORKING_DIR%/wd/cromwell-executions/workflowpath/inputs/blob1"))?.StorageUrl; Assert.IsNotNull(inputFileUrl); azureProxy.Verify(i => i.LocalFileExists("/cromwell-tmp/tmp12345/blob1")); - azureProxy.Verify(i => i.UploadBlobFromFileAsync(It.Is(uri => uri.AbsoluteUri.StartsWith($"{new Uri(inputFileUrl).GetLeftPart(UriPartial.Path)}?sv=")), "/cromwell-tmp/tmp12345/blob1", It.IsAny())); + azureProxy.Verify(i => i.UploadBlobFromFileAsync(It.Is(uri => uri.AbsoluteUri.StartsWith($"{new Uri(inputFileUrl).GetLeftPart(UriPartial.Path)}?sv=")), "/cromwell-tmp/tmp12345/blob1", It.IsAny())); }); } @@ -1538,11 +1539,11 @@ public async Task PoolIsCreatedWithoutPublicIpWhenSubnetAndDisableBatchNodesPubl if (azureProxyReturnValues.BatchTaskState is null) { - await foreach (var _ in batchScheduler.ProcessQueuedTesTasksAsync(new[] { tesTask }, System.Threading.CancellationToken.None)) { } + await foreach (var _ in batchScheduler.ProcessQueuedTesTasksAsync(new[] { tesTask }, CancellationToken.None)) { } } else { - await foreach (var _ in batchScheduler.ProcessTesTaskBatchStatesAsync(new[] { tesTask }, new[] { azureProxyReturnValues.BatchTaskState }, System.Threading.CancellationToken.None)) { } + await foreach (var _ in batchScheduler.ProcessTesTaskBatchStatesAsync(new[] { tesTask }, new[] { azureProxyReturnValues.BatchTaskState }, CancellationToken.None)) { } } var createBatchPoolAsyncInvocation = serviceProvider.AzureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IAzureProxy.CreateBatchPoolAsync)); @@ -1565,14 +1566,14 @@ private static Action> GetMockAllowedVms(IEnumerabl { 
allowedVms = allowedVmsConfig.Split(",").ToList(); } - proxy.Setup(p => p.GetAllowedVmSizes(It.IsAny())) + proxy.Setup(p => p.GetAllowedVmSizes(It.IsAny())) .ReturnsAsync(allowedVms); }); private static Action> GetMockSkuInfoProvider(AzureProxyReturnValues azureProxyReturnValues) => new(proxy => - proxy.Setup(p => p.GetVmSizesAndPricesAsync(It.IsAny(), It.IsAny())) + proxy.Setup(p => p.GetVmSizesAndPricesAsync(It.IsAny(), It.IsAny())) .ReturnsAsync(azureProxyReturnValues.VmSizesAndPrices)); private static Action> GetMockQuotaProvider(AzureProxyReturnValues azureProxyReturnValues) @@ -1582,14 +1583,14 @@ private static Action> GetMockQuotaProvider(AzureProxy var vmFamilyQuota = batchQuotas.DedicatedCoreQuotaPerVMFamily?.FirstOrDefault(v => string.Equals(v.Name, "VmFamily1", StringComparison.InvariantCultureIgnoreCase))?.CoreQuota ?? 0; quotaProvider.Setup(p => - p.GetQuotaForRequirementAsync(It.IsAny(), It.Is(p => p == false), It.IsAny(), It.IsAny())) + p.GetQuotaForRequirementAsync(It.IsAny(), It.Is(p => p == false), It.IsAny(), It.IsAny())) .ReturnsAsync(() => new BatchVmFamilyQuotas(batchQuotas.DedicatedCoreQuota, vmFamilyQuota, batchQuotas.PoolQuota, batchQuotas.ActiveJobAndJobScheduleQuota, batchQuotas.DedicatedCoreQuotaPerVMFamilyEnforced, "VmSize1")); quotaProvider.Setup(p => - p.GetQuotaForRequirementAsync(It.IsAny(), It.Is(p => p == true), It.IsAny(), It.IsAny())) + p.GetQuotaForRequirementAsync(It.IsAny(), It.Is(p => p == true), It.IsAny(), It.IsAny())) .ReturnsAsync(() => new BatchVmFamilyQuotas(batchQuotas.LowPriorityCoreQuota, vmFamilyQuota, batchQuotas.PoolQuota, @@ -1597,14 +1598,14 @@ private static Action> GetMockQuotaProvider(AzureProxy batchQuotas.DedicatedCoreQuotaPerVMFamilyEnforced, "VmSize1")); quotaProvider.Setup(p => - p.GetVmCoreQuotaAsync(It.Is(l => l == true), It.IsAny())) + p.GetVmCoreQuotaAsync(It.Is(l => l == true), It.IsAny())) .ReturnsAsync(new BatchVmCoreQuota(batchQuotas.LowPriorityCoreQuota, true, 
batchQuotas.DedicatedCoreQuotaPerVMFamilyEnforced, batchQuotas.DedicatedCoreQuotaPerVMFamily?.Select(v => new BatchVmCoresPerFamily(v.Name, v.CoreQuota)).ToList(), new(batchQuotas.ActiveJobAndJobScheduleQuota, batchQuotas.PoolQuota, batchQuotas.DedicatedCoreQuota, batchQuotas.LowPriorityCoreQuota))); quotaProvider.Setup(p => - p.GetVmCoreQuotaAsync(It.Is(l => l == false), It.IsAny())) + p.GetVmCoreQuotaAsync(It.Is(l => l == false), It.IsAny())) .ReturnsAsync(new BatchVmCoreQuota(batchQuotas.DedicatedCoreQuota, false, batchQuotas.DedicatedCoreQuotaPerVMFamilyEnforced, @@ -1650,7 +1651,7 @@ private static Action> GetContainerRegistryInfoP AzureProxyReturnValues azureProxyReturnValues) => containerRegistryProvider => { - containerRegistryProvider.Setup(p => p.GetContainerRegistryInfoAsync("registryServer1.io/imageName1:tag1", It.IsAny())) + containerRegistryProvider.Setup(p => p.GetContainerRegistryInfoAsync("registryServer1.io/imageName1:tag1", It.IsAny())) .Returns(Task.FromResult(azureProxyReturnValues.ContainerRegistryInfo)); }; @@ -1663,13 +1664,13 @@ private static Action> GetMockAzureProxy(AzureProxyReturnValue azureProxy.Setup(a => a.GetActivePoolsAsync(It.IsAny())) .Returns(AsyncEnumerable.Empty()); - azureProxy.Setup(a => a.GetStorageAccountInfoAsync("defaultstorageaccount", It.IsAny())) + azureProxy.Setup(a => a.GetStorageAccountInfoAsync("defaultstorageaccount", It.IsAny())) .Returns(Task.FromResult(azureProxyReturnValues.StorageAccountInfos["defaultstorageaccount"])); - azureProxy.Setup(a => a.GetStorageAccountInfoAsync("storageaccount1", It.IsAny())) + azureProxy.Setup(a => a.GetStorageAccountInfoAsync("storageaccount1", It.IsAny())) .Returns(Task.FromResult(azureProxyReturnValues.StorageAccountInfos["storageaccount1"])); - azureProxy.Setup(a => a.GetStorageAccountKeyAsync(It.IsAny(), It.IsAny())) + azureProxy.Setup(a => a.GetStorageAccountKeyAsync(It.IsAny(), It.IsAny())) .Returns(Task.FromResult(azureProxyReturnValues.StorageAccountKey)); 
azureProxy.Setup(a => a.GetBatchActiveNodeCountByVmSize()) @@ -1681,19 +1682,19 @@ private static Action> GetMockAzureProxy(AzureProxyReturnValue azureProxy.Setup(a => a.GetBatchActivePoolCount()) .Returns(azureProxyReturnValues.ActivePoolCount); - azureProxy.Setup(a => a.GetBatchPoolAsync(It.IsAny(), It.IsAny(), It.IsAny())) - .Returns((string id, System.Threading.CancellationToken cancellationToken, DetailLevel detailLevel) => Task.FromResult(azureProxyReturnValues.GetBatchPoolImpl(id))); + azureProxy.Setup(a => a.GetBatchPoolAsync(It.IsAny(), It.IsAny(), It.IsAny())) + .Returns((string id, CancellationToken cancellationToken, DetailLevel detailLevel) => Task.FromResult(azureProxyReturnValues.GetBatchPoolImpl(id))); - azureProxy.Setup(a => a.DownloadBlobAsync(It.IsAny(), It.IsAny())) + azureProxy.Setup(a => a.DownloadBlobAsync(It.IsAny(), It.IsAny())) .Returns(Task.FromResult(azureProxyReturnValues.DownloadedBlobContent)); azureProxy.Setup(a => a.LocalFileExists(It.IsAny())) .Returns(azureProxyReturnValues.LocalFileExists); - azureProxy.Setup(a => a.CreateBatchPoolAsync(It.IsAny(), It.IsAny(), It.IsAny())) - .Returns((Pool p, bool _1, System.Threading.CancellationToken _2) => Task.FromResult(azureProxyReturnValues.CreateBatchPoolImpl(p))); + azureProxy.Setup(a => a.CreateBatchPoolAsync(It.IsAny(), It.IsAny(), It.IsAny())) + .Returns((Pool p, bool _1, CancellationToken _2) => Task.FromResult(azureProxyReturnValues.CreateBatchPoolImpl(p))); - azureProxy.Setup(a => a.GetFullAllocationStateAsync(It.IsAny(), It.IsAny())) + azureProxy.Setup(a => a.GetFullAllocationStateAsync(It.IsAny(), It.IsAny())) .Returns(Task.FromResult(azureProxyReturnValues.AzureProxyGetFullAllocationState?.Invoke() ?? 
new(null, null, null, null, null, null, null))); azureProxy.Setup(a => a.ListComputeNodesAsync(It.IsAny(), It.IsAny())) @@ -1701,14 +1702,14 @@ private static Action> GetMockAzureProxy(AzureProxyReturnValue => AsyncEnumerable.Empty() .Append(BatchPoolTests.GenerateNode(poolId, "ComputeNodeDedicated1", true, true)))); - azureProxy.Setup(a => a.DeleteBatchPoolAsync(It.IsAny(), It.IsAny())) - .Callback((poolId, cancellationToken) => azureProxyReturnValues.AzureProxyDeleteBatchPoolImpl(poolId, cancellationToken)) + azureProxy.Setup(a => a.DeleteBatchPoolAsync(It.IsAny(), It.IsAny())) + .Callback((poolId, cancellationToken) => azureProxyReturnValues.AzureProxyDeleteBatchPoolImpl(poolId, cancellationToken)) .Returns(Task.CompletedTask); azureProxy.Setup(a => a.ListTasksAsync(It.IsAny(), It.IsAny())) .Returns(azureProxyReturnValues.AzureProxyListTasks); - azureProxy.Setup(a => a.ListBlobsAsync(It.IsAny(), It.IsAny())) + azureProxy.Setup(a => a.ListBlobsAsync(It.IsAny(), It.IsAny())) .Returns(AsyncEnumerable.Empty<(string, Uri)>()); }; @@ -1776,7 +1777,7 @@ private static TestServices.TestServiceProvider GetServiceProvi } private static async Task AddPool(BatchScheduler batchScheduler) - => (BatchPool)await batchScheduler.GetOrAddPoolAsync("key1", false, (id, cancellationToken) => ValueTask.FromResult(new(name: id, displayName: "display1", vmSize: "vmSize1")), System.Threading.CancellationToken.None); + => (BatchPool)await batchScheduler.GetOrAddPoolAsync("key1", false, (id, cancellationToken) => ValueTask.FromResult(new(name: id, displayName: "display1", vmSize: "vmSize1")), CancellationToken.None); internal static void GuardAssertsWithTesTask(TesTask tesTask, Action assertBlock) { @@ -1841,8 +1842,8 @@ private struct BatchTaskStates private class AzureProxyReturnValues { internal Func AzureProxyGetFullAllocationState { get; set; } - internal Action AzureProxyDeleteBatchPoolIfExists { get; set; } - internal Action AzureProxyDeleteBatchPool { get; set; } + internal Action 
AzureProxyDeleteBatchPoolIfExists { get; set; } + internal Action AzureProxyDeleteBatchPool { get; set; } internal Func> AzureProxyListTasks { get; set; } = (jobId, detail) => AsyncEnumerable.Empty(); public Dictionary StorageAccountInfos { get; set; } public ContainerRegistryInfo ContainerRegistryInfo { get; set; } @@ -1902,7 +1903,7 @@ private static AzureProxyReturnValues DefaultsPerVMFamilyEnforcedImpl() private readonly Dictionary> poolMetadata = new(); - internal void AzureProxyDeleteBatchPoolImpl(string poolId, System.Threading.CancellationToken cancellationToken) + internal void AzureProxyDeleteBatchPoolImpl(string poolId, CancellationToken cancellationToken) { _ = poolMetadata.Remove(poolId); AzureProxyDeleteBatchPool(poolId, cancellationToken); @@ -1935,10 +1936,10 @@ private class TestBatchQuotaVerifierQuotaMaxedOut : TestBatchQuotaVerifierBase { public TestBatchQuotaVerifierQuotaMaxedOut(IBatchQuotaProvider batchQuotaProvider) : base(batchQuotaProvider) { } - public override Task<(int exceeded, Exception exception)> CheckBatchAccountPoolOrJobQuotasAsync(int required, CancellationToken cancellationToken) - => throw new AzureBatchQuotaMaxedOutException("Test AzureBatchQuotaMaxedOutException"); + public override Task CheckBatchAccountPoolAndJobQuotasAsync(int required, CancellationToken cancellationToken) + => Task.FromResult(new CheckGroupPoolAndJobQuotaResult(required / 2, new AzureBatchQuotaMaxedOutException("Test AzureBatchQuotaMaxedOutException"))); - public override Task CheckBatchAccountQuotasAsync(VirtualMachineInformation _1, bool _2, System.Threading.CancellationToken cancellationToken) + public override Task CheckBatchAccountQuotasAsync(VirtualMachineInformation _1, bool _2, CancellationToken cancellationToken) => throw new AzureBatchQuotaMaxedOutException("Test AzureBatchQuotaMaxedOutException"); } @@ -1946,10 +1947,10 @@ private class TestBatchQuotaVerifierLowQuota : TestBatchQuotaVerifierBase { public 
TestBatchQuotaVerifierLowQuota(IBatchQuotaProvider batchQuotaProvider) : base(batchQuotaProvider) { } - public override Task<(int exceeded, Exception exception)> CheckBatchAccountPoolOrJobQuotasAsync(int required, CancellationToken cancellationToken) + public override Task CheckBatchAccountPoolAndJobQuotasAsync(int required, CancellationToken cancellationToken) => throw new NotSupportedException(); - public override Task CheckBatchAccountQuotasAsync(VirtualMachineInformation _1, bool _2, System.Threading.CancellationToken cancellationToken) + public override Task CheckBatchAccountQuotasAsync(VirtualMachineInformation _1, bool _2, CancellationToken cancellationToken) => throw new AzureBatchLowQuotaException("Test AzureBatchLowQuotaException"); } @@ -1960,9 +1961,9 @@ private abstract class TestBatchQuotaVerifierBase : IBatchQuotaVerifier protected TestBatchQuotaVerifierBase(IBatchQuotaProvider batchQuotaProvider) => this.batchQuotaProvider = batchQuotaProvider; - public abstract Task<(int exceeded, Exception exception)> CheckBatchAccountPoolOrJobQuotasAsync(int required, CancellationToken cancellationToken); + public abstract Task CheckBatchAccountPoolAndJobQuotasAsync(int required, CancellationToken cancellationToken); - public abstract Task CheckBatchAccountQuotasAsync(VirtualMachineInformation virtualMachineInformation, bool needPoolOrJobQuotaCheck, System.Threading.CancellationToken cancellationToken); + public abstract Task CheckBatchAccountQuotasAsync(VirtualMachineInformation virtualMachineInformation, bool needPoolOrJobQuotaCheck, CancellationToken cancellationToken); public IBatchQuotaProvider GetBatchQuotaProvider() => batchQuotaProvider; diff --git a/src/TesApi.Web/AzureBatchTaskState.cs b/src/TesApi.Web/AzureBatchTaskState.cs index ce096bbde..254fb2df8 100644 --- a/src/TesApi.Web/AzureBatchTaskState.cs +++ b/src/TesApi.Web/AzureBatchTaskState.cs @@ -129,9 +129,9 @@ protected AzureBatchTaskState(AzureBatchTaskState other, string appendToSystemLo { Failure 
= other.Failure switch { - null => new("Unknown", Enumerable.Empty().Append(appendToSystemLog)), - { SystemLogs: null } => new(other.Failure.Reason ?? "Unknown", Enumerable.Empty().Append(appendToSystemLog)), - _ => new(other.Failure.Reason ?? "Unknown", other.Failure.SystemLogs.Append(appendToSystemLog)), + null => new("UnknownError", Enumerable.Empty().Append(appendToSystemLog)), + { SystemLogs: null } => new(other.Failure.Reason ?? "UnknownError", Enumerable.Empty().Append(appendToSystemLog)), + _ => new(other.Failure.Reason ?? "UnknownError", other.Failure.SystemLogs.Append(appendToSystemLog)), }; } } diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index da2e1e730..8857090b9 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -11,6 +11,7 @@ using Microsoft.Azure.Batch.Common; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; +using static TesApi.Web.IBatchPool; namespace TesApi.Web { @@ -544,7 +545,7 @@ internal async ValueTask ServicePoolAsync(ServiceKind serviceKind, CancellationT } /// - public async IAsyncEnumerable<(string taskId, AzureBatchTaskState)> ServicePoolAsync([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) + public async ValueTask ServicePoolAsync(CancellationToken cancellationToken) { var exceptions = new List(); @@ -556,12 +557,7 @@ await PerformTask(ServicePoolAsync(ServiceKind.Rotate, cancellationToken), cance switch (exceptions.Count) { case 0: - await foreach (var (id, state) in GetTasksAsync("id", "state eq 'active'").Select(cloud => cloud.Id).Zip(GetFailures(cancellationToken), (id, state) => (id, state)).WithCancellation(cancellationToken)) - { - yield return (id, state); - } - - yield break; + return; case 1: throw exceptions.First(); @@ -577,6 +573,7 @@ static IEnumerable Flatten(Exception ex) _ => Enumerable.Empty().Append(ex), }; + // Returns true to continue to the next action async ValueTask PerformTask(ValueTask 
serviceAction, CancellationToken cancellationToken) { if (!cancellationToken.IsCancellationRequested) @@ -596,7 +593,7 @@ async ValueTask PerformTask(ValueTask serviceAction, CancellationToken can return false; } - // Returns false when pool/job was removed because it was not found. Returns true if the error was completely something else. + // Returns false when pool/job was removed because it was not found. Returns true otherwise. async ValueTask RemoveMissingPoolsAsync(Exception ex, CancellationToken cancellationToken) { switch (ex) @@ -622,19 +619,33 @@ async ValueTask RemoveMissingPoolsAsync(Exception ex, CancellationToken ca } return true; } + } + + /// + public IAsyncEnumerable GetTaskResizeFailures(CancellationToken cancellationToken) + { + return GetTasksAsync("id", "state eq 'active'").Zip( + GetFailures(cancellationToken), + (cloud, state) => new CloudTaskBatchTaskState(cloud.Id, state)); #pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously async IAsyncEnumerable GetFailures([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously { + cancellationToken.ThrowIfCancellationRequested(); + for (var failure = PopNextStartTaskFailure(); failure is not null; failure = PopNextStartTaskFailure()) { yield return ConvertFromStartTask(failure); + cancellationToken.ThrowIfCancellationRequested(); } + cancellationToken.ThrowIfCancellationRequested(); + for (var failure = PopNextResizeError(); failure is not null; failure = PopNextResizeError()) { yield return ConvertFromResize(failure); + cancellationToken.ThrowIfCancellationRequested(); } } diff --git a/src/TesApi.Web/BatchPoolService.cs b/src/TesApi.Web/BatchPoolService.cs index 313e6ac19..ec560534c 100644 --- a/src/TesApi.Web/BatchPoolService.cs +++ b/src/TesApi.Web/BatchPoolService.cs @@ -11,6 +11,7 @@ using Microsoft.Extensions.Logging; 
using Tes.Models; using Tes.Repository; +using static TesApi.Web.IBatchPool; namespace TesApi.Web { @@ -104,7 +105,14 @@ private async ValueTask ExecuteServiceBatchPoolsAsync(CancellationToken stopping { var list = new ConcurrentBag<(TesTask TesTask, AzureBatchTaskState State)>(); - await ExecuteActionOnPoolsAsync("Service Batch Pools", (pool, token) => ProcessFailures(pool.ServicePoolAsync(token), token), stoppingToken); + await ExecuteActionOnPoolsAsync( + "Service Batch Pools", + async (pool, token) => + { + await pool.ServicePoolAsync(token); + await ProcessFailures(pool.GetTaskResizeFailures(token), token); + }, + stoppingToken); await OrchestrateTesTasksOnBatchAsync( "Failures", @@ -114,7 +122,7 @@ await OrchestrateTesTasksOnBatchAsync( (tesTasks, token) => batchScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, list.Select(t => t.State).ToArray(), token), stoppingToken); - async ValueTask ProcessFailures(IAsyncEnumerable<(string taskId, AzureBatchTaskState)> failures, CancellationToken cancellationToken) + async ValueTask ProcessFailures(IAsyncEnumerable failures, CancellationToken cancellationToken) { await foreach (var (cloudTaskId, state) in failures.WithCancellation(cancellationToken)) { @@ -170,7 +178,7 @@ async IAsyncEnumerable GetTesTasks([System.Runtime.CompilerServices.Enu } else { - logger.LogDebug("Could not find task {TesTask}.", tesTaskId); + logger.LogDebug("Could not find completed task {TesTask}.", tesTaskId); yield return null; } } diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 20b5db103..97315f27f 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -388,12 +388,15 @@ public IAsyncEnumerable> ProcessTesTaskBatchStatesAsync(IEnume ArgumentNullException.ThrowIfNull(taskStates); return taskStates.Zip(tesTasks, (TaskState, TesTask) => (TaskState, TesTask)) - .Where(entry => entry.TesTask?.IsActiveState() ?? 
false) // Removes already terminal TesTasks from being further processed. + .Where(entry => entry.TesTask?.IsActiveState() ?? false) // Removes already terminal (and null) TesTasks from being further processed. .Select(entry => new TesTaskTask(WrapHandleTesTaskTransitionAsync(entry.TesTask, entry.TaskState, cancellationToken), entry.TesTask)) .WhenEach(cancellationToken, tesTaskTask => tesTaskTask.Task); async Task WrapHandleTesTaskTransitionAsync(TesTask tesTask, AzureBatchTaskState azureBatchTaskState, CancellationToken cancellationToken) => await HandleTesTaskTransitionAsync(tesTask, azureBatchTaskState, cancellationToken); + + //Task WrapHandleTesTaskTransitionAsync(TesTask tesTask, AzureBatchTaskState azureBatchTaskState, CancellationToken cancellationToken) + // => Task.Run(async () => await HandleTesTaskTransitionAsync(tesTask, azureBatchTaskState, cancellationToken)); } private static string GetCromwellExecutionDirectoryPathAsUrl(TesTask task) @@ -475,93 +478,98 @@ private static bool IsCromwellCommandScript(TesInput inputFile) public async IAsyncEnumerable> ProcessQueuedTesTasksAsync(TesTask[] tesTasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) { var tasksMetadataByPoolKey = new Dictionary Identities, string PoolDisplayName)>>(); - var poolKeyByTaskIds = new Dictionary(); - var tasks = tesTasks.ToList(); + var poolKeyByTaskIds = new Dictionary(); // Reverse lookup of 'tasksMetadataByPoolKey' - // Determine how many nodes in each pool we might need for this group. - foreach (var tesTask in tesTasks) // TODO: Consider parallelizing this foreach loop { - Task quickResult = default; - string poolKey = default; - var identities = new List(); + var tasks = tesTasks.ToList(); // List of tasks that will make it to the next round. - if (!string.IsNullOrWhiteSpace(globalManagedIdentity)) + // Determine how many nodes in each pool we might need for this group. 
+ foreach (var tesTask in tesTasks) // TODO: Consider parallelizing this foreach loop. { - identities.Add(globalManagedIdentity); - } + Task quickResult = default; // fast exit enabler + string poolKey = default; + var identities = new List(); - if (tesTask.Resources?.ContainsBackendParameterValue(TesResources.SupportedBackendParameters.workflow_execution_identity) == true) - { - identities.Add(tesTask.Resources?.GetBackendParameterValue(TesResources.SupportedBackendParameters.workflow_execution_identity)); - } + if (!string.IsNullOrWhiteSpace(globalManagedIdentity)) + { + identities.Add(globalManagedIdentity); + } - try - { - var virtualMachineInfo = await GetVmSizeAsync(tesTask, cancellationToken); - var containerMetadata = await GetContainerConfigurationIfNeededAsync(tesTask, cancellationToken); - (poolKey, var displayName) = GetPoolKey(tesTask, virtualMachineInfo, containerMetadata.ContainerConfiguration, identities, cancellationToken); - await quotaVerifier.CheckBatchAccountQuotasAsync(virtualMachineInfo, needPoolOrJobQuotaCheck: !IsPoolAvailable(poolKey), cancellationToken: cancellationToken); + if (tesTask.Resources?.ContainsBackendParameterValue(TesResources.SupportedBackendParameters.workflow_execution_identity) == true) + { + identities.Add(tesTask.Resources?.GetBackendParameterValue(TesResources.SupportedBackendParameters.workflow_execution_identity)); + } - if (tasksMetadataByPoolKey.TryGetValue(poolKey, out var resource)) + try { - resource.Add((tesTask, virtualMachineInfo, containerMetadata, identities, displayName)); + var virtualMachineInfo = await GetVmSizeAsync(tesTask, cancellationToken); + var containerMetadata = await GetContainerConfigurationIfNeededAsync(tesTask, cancellationToken); + (poolKey, var displayName) = GetPoolKey(tesTask, virtualMachineInfo, containerMetadata.ContainerConfiguration, identities, cancellationToken); + await quotaVerifier.CheckBatchAccountQuotasAsync(virtualMachineInfo, needPoolOrJobQuotaCheck: 
!IsPoolAvailable(poolKey), cancellationToken: cancellationToken); + + if (tasksMetadataByPoolKey.TryGetValue(poolKey, out var resource)) + { + resource.Add((tesTask, virtualMachineInfo, containerMetadata, identities, displayName)); + } + else + { + tasksMetadataByPoolKey.Add(poolKey, new() { (tesTask, virtualMachineInfo, containerMetadata, identities, displayName) }); + } + + poolKeyByTaskIds.Add(tesTask.Id, poolKey); } - else + catch (Exception ex) { - tasksMetadataByPoolKey.Add(poolKey, new() { (tesTask, virtualMachineInfo, containerMetadata, identities, displayName) }); + quickResult = HandleException(ex, poolKey, tesTask); } - poolKeyByTaskIds.Add(tesTask.Id, poolKey); - } - catch (Exception ex) - { - quickResult = HandleException(ex, poolKey, tesTask); + if (quickResult is not null) + { + tasks.Remove(tesTask); + yield return new(quickResult, tesTask); + } } - if (quickResult is not null) - { - tasks.Remove(tesTask); - yield return new(quickResult, tesTask); - } + // Remove already returned tasks from the dictionary + tasksMetadataByPoolKey = tasksMetadataByPoolKey + .Select(p => (p.Key, Value: p.Value.Where(v => tasks.Contains(v.TesTask)).ToList())) // keep only tasks that remain in the 'tasks' variable + .Where(t => t.Value.Count != 0) // Remove any now empty pool keys + .ToDictionary(p => p.Key, p => p.Value); } - // Remove already returned tasks from the dictionary - tasksMetadataByPoolKey = tasksMetadataByPoolKey - .Select(p => (p.Key, Value: p.Value.Where(v => tasks.Contains(v.TesTask)).ToList())) // keep only tasks that remain in the tasks variable - .Where(t => t.Value.Count != 0) // Remove any empty pool keys - .ToDictionary(p => p.Key, p => p.Value); - // Determine how many nodes in each new pool we might need for this group. 
var neededPoolNodesByPoolKey = tasksMetadataByPoolKey.ToDictionary(t => t.Key, t => t.Value.Count); - // Determine how many new pools/jobs we will need for this batch - var requiredNewPools = neededPoolNodesByPoolKey.Where(t => !IsPoolAvailable(t.Key)).Count(); - - // Revisit pool/job quotas (the above loop already dealt with the possiblility of needing just one more pool/job) - if (requiredNewPools > 1) { - var (excess, exception) = await quotaVerifier.CheckBatchAccountPoolOrJobQuotasAsync(requiredNewPools, cancellationToken); - var initial = tasksMetadataByPoolKey.Count - 1; - var final = initial - excess; + // Determine how many new pools/jobs we will need for this batch + var requiredNewPools = neededPoolNodesByPoolKey.Where(t => !IsPoolAvailable(t.Key)).Count(); - for (var i = initial; i > final; --i) + // Revisit pool/job quotas (the above loop already dealt with the possiblility of needing just one more pool/job). + // This will remove pool keys we cannot accomodate due to quota, along with all of their associated tasks, from being queued into Batch. + if (requiredNewPools > 1) { - var key = tasksMetadataByPoolKey.Keys.ElementAt(i); - if (tasksMetadataByPoolKey.Remove(key, out var listOfTaskMetadata)) + var (excess, exception) = await quotaVerifier.CheckBatchAccountPoolAndJobQuotasAsync(requiredNewPools, cancellationToken); + var initial = tasksMetadataByPoolKey.Count - 1; + var final = initial - excess; + + for (var i = initial; i > final; --i) { - foreach (var (task, _, _, _, _) in listOfTaskMetadata) + var key = tasksMetadataByPoolKey.Keys.ElementAt(i); + if (tasksMetadataByPoolKey.Remove(key, out var listOfTaskMetadata)) { - tasks.Remove(task); - yield return new(HandleException(exception, key, task), task); + foreach (var (task, _, _, _, _) in listOfTaskMetadata) + { + yield return new(HandleException(exception, key, task), task); + } } } } } // Obtain assigned pool and create and assign the cloudtask for each task. 
- foreach (var (tesTask, virtualMachineInfo, containerMetadata, identities, displayName) in tasksMetadataByPoolKey.Values.SelectMany(e => e)) + foreach (var (tesTask, virtualMachineInfo, containerMetadata, identities, displayName) in tasksMetadataByPoolKey.Values.SelectMany(e => e)) // TODO: Consider parallelizing this foreach loop. Would require making GetOrAddPoolAsync multi-threaded safe. { - Task quickResult = default; + Task quickResult = default; // fast exit enabler var poolKey = poolKeyByTaskIds[tesTask.Id]; try @@ -720,16 +728,6 @@ Task HandleException(Exception exception, string poolKey, TesTask tesTask) /// Current Azure Batch task info /// A for controlling the lifetime of the asynchronous operation. /// True if the TES task was changed. - // When task is executed the following may be touched: - // tesTask.Log[].SystemLog - // tesTask.Log[].FailureReason - // tesTask.Log[].CromwellResultCode - // tesTask.Log[].BatchExecutionMetrics - // tesTask.Log[].EndTime - // tesTask.Log[].Log[].StdErr - // tesTask.Log[].Log[].ExitCode - // tesTask.Log[].Log[].StartTime - // tesTask.Log[].Log[].EndTime private ValueTask HandleTesTaskTransitionAsync(TesTask tesTask, AzureBatchTaskState azureBatchTaskState, CancellationToken cancellationToken) => tesTaskStateTransitions .FirstOrDefault(m => (m.Condition is null || m.Condition(tesTask)) && (m.CurrentBatchTaskState is null || m.CurrentBatchTaskState == azureBatchTaskState.State)) @@ -787,6 +785,7 @@ private async Task> GetAdditionalCromwellInputsAsync(TesTask tas // WORKAROUND: Get the list of files in the execution directory and add them to task inputs. // TODO: Verify whether this workaround is still needed. 
var additionalInputs = new List(); + if (cromwellExecutionDirectoryUrl is not null) { additionalInputs = @@ -800,7 +799,7 @@ await GetExistingBlobsInCromwellStorageLocationAsTesInputsAsync(task, cromwellEx private async Task> GetExistingBlobsInCromwellStorageLocationAsTesInputsAsync(TesTask task, string cromwellExecutionDirectoryUrl, CancellationToken cancellationToken) { - var additionalInputFiles = new List(); + List additionalInputFiles = default; if (!Uri.TryCreate(cromwellExecutionDirectoryUrl, UriKind.Absolute, out _)) { @@ -814,12 +813,12 @@ private async Task> GetExistingBlobsInCromwellStorageLocationAsTe { var blobsInExecutionDirectory = await azureProxy.ListBlobsAsync(new Uri(executionDirectoryUri), cancellationToken).ToListAsync(cancellationToken); - logger.LogDebug($"Found {blobsInExecutionDirectory.Count} items in cromwell's task execution directory."); var scriptBlob = blobsInExecutionDirectory.FirstOrDefault(b => b.Name.EndsWith($"/{CromwellScriptFileName}")); var commandScript = task.Inputs?.FirstOrDefault( IsCromwellCommandScript); // this should never be null because it's used to set isCromwell + logger.LogDebug($"GetExistingBlobsInCromwellStorageLocationAsTesInputsAsync: scriptBlob is default: {scriptBlob == default} commandScript is null: {commandScript is null}"); if (scriptBlob != default) { @@ -843,7 +842,7 @@ private async Task> GetExistingBlobsInCromwellStorageLocationAsTe } } - return additionalInputFiles; + return additionalInputFiles ?? new(); } private void ValidateTesTask(TesTask task) @@ -956,6 +955,7 @@ var s when s.StartsWith("batch.node.centos ") => "sudo yum install epel-release /// The to schedule on Azure Batch /// A for controlling the lifetime of the asynchronous operation. 
/// + // TODO: remove this as soon as the node runner can authenticate to container registries private async ValueTask<(BatchModels.ContainerConfiguration ContainerConfiguration, (bool ExecutorImage, bool DockerInDockerImage, bool CromwellDrsImage) IsPublic)> GetContainerConfigurationIfNeededAsync(TesTask tesTask, CancellationToken cancellationToken) { var drsImageNeeded = tesTask.Inputs?.Any(i => i?.Url?.StartsWith("drs://") ?? false) ?? false; diff --git a/src/TesApi.Web/IBatchPool.cs b/src/TesApi.Web/IBatchPool.cs index 7038742ae..936abb90e 100644 --- a/src/TesApi.Web/IBatchPool.cs +++ b/src/TesApi.Web/IBatchPool.cs @@ -51,8 +51,14 @@ public interface IBatchPool /// /// /// Calls each internal servicing method in order. Throws all exceptions gathered from all methods. + ValueTask ServicePoolAsync(CancellationToken cancellationToken = default); + + /// + /// Gets nonrecoverable compute node related failures that occur before tasks are assigned to compute nodes. + /// + /// /// - IAsyncEnumerable<(string taskId, AzureBatchTaskState)> ServicePoolAsync(CancellationToken cancellationToken = default); + IAsyncEnumerable GetTaskResizeFailures(CancellationToken cancellationToken = default); /// /// Gets the last time the pool's compute node list was changed. @@ -67,5 +73,12 @@ public interface IBatchPool /// /// IAsyncEnumerable GetCompletedTasks(CancellationToken cancellationToken); + + /// + /// A not yet assigned a compute nodes to remove due to a nonrecoverable compute node or pool resize error. + /// + /// A s not yet assigned a compute node. + /// A compute node and/or pool resize error. 
+ public record CloudTaskBatchTaskState(string CloudTaskId, AzureBatchTaskState TaskState); } } diff --git a/src/TesApi.Web/Management/ArmBatchQuotaProvider.cs b/src/TesApi.Web/Management/ArmBatchQuotaProvider.cs index 6c4015642..87af6b6af 100644 --- a/src/TesApi.Web/Management/ArmBatchQuotaProvider.cs +++ b/src/TesApi.Web/Management/ArmBatchQuotaProvider.cs @@ -10,6 +10,7 @@ using Microsoft.Extensions.Caching.Memory; using Microsoft.Extensions.Logging; using TesApi.Web.Management.Models.Quotas; +using static TesApi.Web.Management.IBatchQuotaProvider; namespace TesApi.Web.Management; @@ -75,10 +76,10 @@ public async Task GetVmCoreQuotaAsync(bool lowPriority, Cancel } /// - public async Task<(int PoolQuota, int ActiveJobAndJobScheduleQuota)> GetPoolOrJobQuotaAsync(CancellationToken cancellationToken) + public async Task GetPoolAndJobQuotaAsync(CancellationToken cancellationToken) { var quotas = await GetBatchAccountQuotasAsync(cancellationToken); - return (quotas.PoolQuota, quotas.ActiveJobAndJobScheduleQuota); + return new(quotas.PoolQuota, quotas.ActiveJobAndJobScheduleQuota); } /// diff --git a/src/TesApi.Web/Management/BatchQuotaVerifier.cs b/src/TesApi.Web/Management/BatchQuotaVerifier.cs index e11044713..3eb6e0858 100644 --- a/src/TesApi.Web/Management/BatchQuotaVerifier.cs +++ b/src/TesApi.Web/Management/BatchQuotaVerifier.cs @@ -8,6 +8,7 @@ using Microsoft.Extensions.Logging; using Tes.Models; using TesApi.Web.Management.Models.Quotas; +using static TesApi.Web.Management.IBatchQuotaVerifier; namespace TesApi.Web.Management; @@ -121,25 +122,25 @@ public async Task CheckBatchAccountQuotasAsync(VirtualMachineInformation virtual } /// - public async Task<(int exceeded, Exception exception)> CheckBatchAccountPoolOrJobQuotasAsync(int required, CancellationToken cancellationToken) + public async Task CheckBatchAccountPoolAndJobQuotasAsync(int required, CancellationToken cancellationToken) { - var (poolQuota, activeJobAndJobScheduleQuota) = await 
batchQuotaProvider.GetPoolOrJobQuotaAsync(cancellationToken); + var (poolQuota, activeJobAndJobScheduleQuota) = await batchQuotaProvider.GetPoolAndJobQuotaAsync(cancellationToken); return CheckBatchAccountPoolOrJobQuotasImpl(required, azureProxy.GetBatchActiveJobCount(), azureProxy.GetBatchActivePoolCount(), activeJobAndJobScheduleQuota, poolQuota); } - private static (int exceeded, Exception exception) CheckBatchAccountPoolOrJobQuotasImpl(int required, int activeJobsCount, int activePoolsCount, int activeJobAndJobScheduleQuota, int poolQuota) + private static CheckGroupPoolAndJobQuotaResult CheckBatchAccountPoolOrJobQuotasImpl(int required, int activeJobsCount, int activePoolsCount, int activeJobAndJobScheduleQuota, int poolQuota) { if (activeJobsCount + required > activeJobAndJobScheduleQuota) { - return (activeJobsCount + required - activeJobAndJobScheduleQuota, new AzureBatchQuotaMaxedOutException($"No remaining active jobs quota available. There are {activePoolsCount} active jobs out of {activeJobAndJobScheduleQuota}.")); + return new(activeJobsCount + required - activeJobAndJobScheduleQuota, new AzureBatchQuotaMaxedOutException($"No remaining active jobs quota available. There are {activePoolsCount} active jobs (with {required} more being created) out of {activeJobAndJobScheduleQuota}.")); } if (activePoolsCount + required > poolQuota) { - return (activePoolsCount + required - poolQuota, new AzureBatchQuotaMaxedOutException($"No remaining pool quota available. There are {activePoolsCount} pools in use out of {poolQuota}.")); + return new(activePoolsCount + required - poolQuota, new AzureBatchQuotaMaxedOutException($"No remaining pool quota available. 
There are {activePoolsCount} pools in use (with {required} more being created) out of {poolQuota}.")); } - return (0, null); + return new(0, null); } /// diff --git a/src/TesApi.Web/Management/IBatchQuotaProvider.cs b/src/TesApi.Web/Management/IBatchQuotaProvider.cs index 33c24e878..d1e176374 100644 --- a/src/TesApi.Web/Management/IBatchQuotaProvider.cs +++ b/src/TesApi.Web/Management/IBatchQuotaProvider.cs @@ -39,6 +39,13 @@ Task GetQuotaForRequirementAsync( /// /// A for controlling the lifetime of the asynchronous operation. /// - Task<(int PoolQuota, int ActiveJobAndJobScheduleQuota)> GetPoolOrJobQuotaAsync(CancellationToken cancellationToken); + Task GetPoolAndJobQuotaAsync(CancellationToken cancellationToken); + + /// + /// Pool and Active Job and JobSchedule quotas. + /// + /// Pool quota. + /// Active Job and JobSchedule quota. + public record struct PoolAndJobQuota(int PoolQuota, int ActiveJobAndJobScheduleQuota); } } diff --git a/src/TesApi.Web/Management/IBatchQuotaVerifier.cs b/src/TesApi.Web/Management/IBatchQuotaVerifier.cs index 8e9e48e56..fb5179edd 100644 --- a/src/TesApi.Web/Management/IBatchQuotaVerifier.cs +++ b/src/TesApi.Web/Management/IBatchQuotaVerifier.cs @@ -27,15 +27,22 @@ public interface IBatchQuotaVerifier /// /// Checks if the current quota allows creation of the requested quantity of new Pools and Jobs. /// - /// + /// The quantity of new pools and jobs that need to be accomodated for success. /// A for controlling the lifetime of the asynchronous operation. - /// The size of the portion of that would've resulted in the returned . - Task<(int exceeded, Exception exception)> CheckBatchAccountPoolOrJobQuotasAsync(int required, CancellationToken cancellationToken); + /// A that returns the size of the portion of that would've resulted in the provided . + Task CheckBatchAccountPoolAndJobQuotasAsync(int required, CancellationToken cancellationToken); /// /// Gets the instance of the batch quota provider. 
/// /// Batch quota provider IBatchQuotaProvider GetBatchQuotaProvider(); + + /// + /// Result of group checking quota for pools and jobs. + /// + /// The number of pools or jobs above the "required" request that exceeded the available quota. + /// The to return to the tasks that could not be accomodated. + public record struct CheckGroupPoolAndJobQuotaResult(int Exceeded, Exception Exception); } } diff --git a/src/TesApi.Web/Management/TerraQuotaProvider.cs b/src/TesApi.Web/Management/TerraQuotaProvider.cs index d0e98ccd8..70bb86684 100644 --- a/src/TesApi.Web/Management/TerraQuotaProvider.cs +++ b/src/TesApi.Web/Management/TerraQuotaProvider.cs @@ -11,6 +11,7 @@ using TesApi.Web.Management.Configuration; using TesApi.Web.Management.Models.Quotas; using TesApi.Web.Management.Models.Terra; +using static TesApi.Web.Management.IBatchQuotaProvider; namespace TesApi.Web.Management { @@ -86,10 +87,10 @@ public async Task GetVmCoreQuotaAsync(bool lowPriority, Cancel } /// - public async Task<(int PoolQuota, int ActiveJobAndJobScheduleQuota)> GetPoolOrJobQuotaAsync(CancellationToken cancellationToken) + public async Task GetPoolAndJobQuotaAsync(CancellationToken cancellationToken) { var quotas = await GetBatchAccountQuotaFromTerraAsync(cancellationToken); - return (quotas.QuotaValues.PoolQuota, quotas.QuotaValues.ActiveJobAndJobScheduleQuota); + return new(quotas.QuotaValues.PoolQuota, quotas.QuotaValues.ActiveJobAndJobScheduleQuota); } private async Task GetBatchAccountQuotaFromTerraAsync(CancellationToken cancellationToken) diff --git a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs index 434b06676..68bb1142e 100644 --- a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs +++ b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs @@ -145,7 +145,7 @@ protected async Task ExecuteActionOnIntervalAsync(TimeSpan runInterval, Func, performs an action in the batch system using , and updates the 
resultant state /// /// A System.Threading.Tasks.ValueTask that represents the long running operations. - protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func>> tesTaskGetter, Func>> tesTaskProcessor, CancellationToken stoppingToken) + protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func>> tesTaskGetter, Func>> tesTaskProcessor, CancellationToken stoppingToken, string unitsLabel = "tasks") { var tesTasks = await (await tesTaskGetter(stoppingToken)).ToArrayAsync(stoppingToken); @@ -281,7 +281,7 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func< await batchScheduler.FlushPoolsAsync(pools, stoppingToken); } - logger.LogDebug("OrchestrateTesTasksOnBatch({Poll}) for {TaskCount} tasks completed in {TotalSeconds} seconds.", pollName, tesTasks.Where(task => task is not null).Count(), DateTime.UtcNow.Subtract(startTime).TotalSeconds); + logger.LogDebug("OrchestrateTesTasksOnBatch({Poll}) for {TaskCount} {UnitsLabel} completed in {TotalSeconds} seconds.", pollName, tesTasks.Where(task => task is not null).Count(), unitsLabel, DateTime.UtcNow.Subtract(startTime).TotalSeconds); } } } diff --git a/src/TesApi.Web/Scheduler.cs b/src/TesApi.Web/Scheduler.cs index c4d885a60..c8622ff4c 100644 --- a/src/TesApi.Web/Scheduler.cs +++ b/src/TesApi.Web/Scheduler.cs @@ -181,7 +181,8 @@ await OrchestrateTesTasksOnBatchAsync( async token => GetTesTasks(token), #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously (tesTasks, token) => batchScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, messages.Select(t => t.State).ToArray(), token), - stoppingToken); + stoppingToken, + "events"); // Helpers async ValueTask ProcessMessage(NodeEventMessage messageInfo, CancellationToken cancellationToken) @@ -197,12 +198,12 @@ async IAsyncEnumerable GetTesTasks([System.Runtime.CompilerServices.Enu TesTask tesTask = default; if (await repository.TryGetItemAsync(id, 
cancellationToken, task => tesTask = task) && tesTask is not null) { - logger.LogDebug("Completing task {TesTask}.", tesTask.Id); + logger.LogDebug("Completing event for task {TesTask}.", tesTask.Id); yield return tesTask; } else { - logger.LogDebug("Could not find task {TesTask}.", id); + logger.LogDebug("Could not find task {TesTask} for event.", id); yield return null; } } diff --git a/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs b/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs index 147d28f8e..f6d7da10e 100644 --- a/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs +++ b/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs @@ -14,7 +14,6 @@ using Microsoft.Extensions.Options; using Tes.Extensions; using Tes.Models; -using TesApi.Web.Management.Configuration; using TesApi.Web.Options; namespace TesApi.Web.Storage @@ -147,6 +146,16 @@ private Task AddSasTokenAsync(StorageAccountUrlSegmen return AddSasTokenAsyncImpl(pathSegments, sasTokenDuration, (expiresOn, _1) => new BlobSasBuilder(containerPermissions, expiresOn) { BlobName = string.Empty }, path, cancellationToken); } + /// + /// Generates SAS token for both blobs and containers. + /// + /// Target of SAS token. + /// Length of time from now for which SAS token should remain valid. + /// A factory that generates a . Receives the expiration time and the blobName, which should be set on the sas builder as appropriate. + /// Logging metadata for failures locating storage account. + /// + /// + /// private async Task AddSasTokenAsyncImpl(StorageAccountUrlSegments pathSegments, TimeSpan? 
sasTokenDuration, Func createBuilder, string path, CancellationToken cancellationToken) { StorageAccountInfo storageAccountInfo = null; From ba71c1e87ad76163772a5c63d518a34afcfa14ee Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 26 Oct 2023 18:51:59 -0700 Subject: [PATCH 053/202] Continue debugging --- src/TesApi.Web/BatchScheduler.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 97315f27f..6bfa7f4b6 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -818,7 +818,6 @@ private async Task> GetExistingBlobsInCromwellStorageLocationAsTe var commandScript = task.Inputs?.FirstOrDefault( IsCromwellCommandScript); // this should never be null because it's used to set isCromwell - logger.LogDebug($"GetExistingBlobsInCromwellStorageLocationAsTesInputsAsync: scriptBlob is default: {scriptBlob == default} commandScript is null: {commandScript is null}"); if (scriptBlob != default) { @@ -827,6 +826,7 @@ private async Task> GetExistingBlobsInCromwellStorageLocationAsTe if (commandScript is not null) { + logger.LogDebug($"GetExistingBlobsInCromwellStorageLocationAsTesInputsAsync: commandScript path: {commandScript.Path} blobsInExecutionDirectory: '{string.Join("', '", blobsInExecutionDirectory.Select(b => b.Name))}'"); var expectedPathParts = commandScript.Path.Split('/').Length; additionalInputFiles = blobsInExecutionDirectory From 60db169c9091379d168fed10379dab46bad7fafc Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 26 Oct 2023 23:06:45 -0700 Subject: [PATCH 054/202] Cleanup --- src/TesApi.Tests/BatchSchedulerTests.cs | 21 +++++-- src/TesApi.Web/AzureProxy.cs | 4 +- src/TesApi.Web/BatchScheduler.cs | 56 ++++++++++--------- .../CachingWithRetriesAzureProxy.cs | 5 +- src/TesApi.Web/IAzureProxy.cs | 5 +- .../Management/IBatchQuotaProvider.cs | 7 --- .../Management/IBatchQuotaVerifier.cs | 9 +-- 
.../Quotas/CheckGroupPoolAndJobQuotaResult.cs | 14 +++++ .../Models/Quotas/PoolAndJobQuota.cs | 12 ++++ src/TesApi.Web/Storage/BlobNameAndUri.cs | 14 +++++ .../Storage/DefaultStorageAccessProvider.cs | 24 +++++++- .../Storage/IStorageAccessProvider.cs | 30 +++++----- 12 files changed, 133 insertions(+), 68 deletions(-) create mode 100644 src/TesApi.Web/Management/Models/Quotas/CheckGroupPoolAndJobQuotaResult.cs create mode 100644 src/TesApi.Web/Management/Models/Quotas/PoolAndJobQuota.cs create mode 100644 src/TesApi.Web/Storage/BlobNameAndUri.cs diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index 139afd2bb..143f16c45 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -1386,7 +1386,7 @@ public async Task CromwellWriteFilesAreDiscoveredAndAddedIfMissedWithContentScri serviceProvider.AzureProxy.Setup(p => p.ListBlobsAsync(It.Is(executionDirectoryUri, new UrlMutableSASEqualityComparer()), It.IsAny())).Returns(executionDirectoryBlobs.ToAsyncEnumerable()); - var uri = new UriBuilder(executionDirectoryUri); + var uri = new UriBuilder(executionDirectoryUri) { Query = null }; uri.Path = uri.Path.TrimEnd('/') + $"/{fileName}"; TesInput writeInput = new() { Url = uri.Uri.AbsoluteUri, Path = Path.Combine(Path.GetDirectoryName(script[1]), fileName).Replace('\\', '/'), Type = TesFileType.FILEEnum, Name = "write_", Content = null }; @@ -1407,8 +1407,21 @@ public async Task CromwellWriteFilesAreDiscoveredAndAddedIfMissedWithContentScri Assert.AreEqual(2, filesToDownload.Length); }); - static (string Name, Uri Uri) BlobNameUriFromTesInput(TesInput input) - => (input.Path, UriFromTesInput(input)); + static BlobNameAndUri BlobNameUriFromTesInput(TesInput input) + => new(BlobNameFromTesInput(input), UriFromTesInput(input)); + + static string BlobNameFromTesInput(TesInput input) + { + var uri = UriFromTesInput(input); + + if (uri.IsFile) + { + var trimmedPath = 
input.Path.TrimStart('/'); + return trimmedPath[trimmedPath.IndexOf('/')..].TrimStart('/'); + } + + return new Azure.Storage.Blobs.BlobUriBuilder(uri).BlobName; + } static Uri UriFromTesInput(TesInput input) { @@ -1710,7 +1723,7 @@ private static Action> GetMockAzureProxy(AzureProxyReturnValue .Returns(azureProxyReturnValues.AzureProxyListTasks); azureProxy.Setup(a => a.ListBlobsAsync(It.IsAny(), It.IsAny())) - .Returns(AsyncEnumerable.Empty<(string, Uri)>()); + .Returns(AsyncEnumerable.Empty()); }; private static Func> GetMockConfig() diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index 1bfc465b6..9757627fd 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -398,12 +398,12 @@ public async Task GetBlobPropertiesAsync(Uri blobAbsoluteUri, Ca } /// - public IAsyncEnumerable<(string Name, Uri Uri)> ListBlobsAsync(Uri directoryUri, CancellationToken cancellationToken) + public IAsyncEnumerable ListBlobsAsync(Uri directoryUri, CancellationToken cancellationToken) { var directory = (new BlobClient(directoryUri, new(BlobClientOptions.ServiceVersion.V2021_04_10))); return directory.GetParentBlobContainerClient() .GetBlobsAsync(prefix: directory.Name.TrimEnd('/') + "/", cancellationToken: cancellationToken) - .Select(blobItem => (blobItem.Name, new BlobUriBuilder(directory.Uri) { Sas = null, BlobName = blobItem.Name }.ToUri())); + .Select(blobItem => new BlobNameAndUri(blobItem.Name, new BlobUriBuilder(directory.Uri) { Sas = null, BlobName = blobItem.Name, Query = null }.ToUri())); } /// diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 6bfa7f4b6..a58f8ddd2 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -410,6 +410,11 @@ var x when string.IsNullOrEmpty(x.Content) => GetParentUrl(commandScript.Url), }; } + private static string GetCromwellExecutionDirectoryPathAsExecutionContainerPath(TesTask task) + { + return 
task.Inputs?.FirstOrDefault(IsCromwellCommandScript)?.Path; + } + private string GetStorageUploadPath(TesTask task) { return task.Resources?.ContainsBackendParameterValue(TesResources.SupportedBackendParameters.internal_path_prefix) ?? false @@ -800,46 +805,45 @@ private async Task> GetExistingBlobsInCromwellStorageLocationAsTe string cromwellExecutionDirectoryUrl, CancellationToken cancellationToken) { List additionalInputFiles = default; + var scriptPath = GetCromwellExecutionDirectoryPathAsExecutionContainerPath(task); if (!Uri.TryCreate(cromwellExecutionDirectoryUrl, UriKind.Absolute, out _)) { cromwellExecutionDirectoryUrl = $"/{cromwellExecutionDirectoryUrl}"; } - var executionDirectoryUri = await storageAccessProvider.MapLocalPathToSasUrlAsync(cromwellExecutionDirectoryUrl, + var executionDirectoryUriString = await storageAccessProvider.MapLocalPathToSasUrlAsync(cromwellExecutionDirectoryUrl, storageAccessProvider.DefaultContainerPermissions, cancellationToken); + var executionDirectoryUri = string.IsNullOrEmpty(executionDirectoryUriString) ? 
null : new Uri(executionDirectoryUriString); + if (executionDirectoryUri is not null) { + var executionDirectoryBlobName = new Azure.Storage.Blobs.BlobUriBuilder(executionDirectoryUri).BlobName; + var startOfBlobNameIndex = scriptPath.IndexOf(executionDirectoryBlobName, StringComparison.OrdinalIgnoreCase); + var pathBlobPrefix = scriptPath[..startOfBlobNameIndex]; + var blobsInExecutionDirectory = - await azureProxy.ListBlobsAsync(new Uri(executionDirectoryUri), cancellationToken).ToListAsync(cancellationToken); - var scriptBlob = - blobsInExecutionDirectory.FirstOrDefault(b => b.Name.EndsWith($"/{CromwellScriptFileName}")); - var commandScript = - task.Inputs?.FirstOrDefault( - IsCromwellCommandScript); // this should never be null because it's used to set isCromwell + await azureProxy.ListBlobsAsync(executionDirectoryUri, cancellationToken) + .Select(info => (Path: $"{pathBlobPrefix}{info.BlobName}", Uri: info.BlobUri)) + .ToListAsync(cancellationToken); - if (scriptBlob != default) - { - blobsInExecutionDirectory.Remove(scriptBlob); - } + var scriptBlob = + blobsInExecutionDirectory.FirstOrDefault(b => scriptPath.Equals(b.Path, StringComparison.OrdinalIgnoreCase)); - if (commandScript is not null) - { - logger.LogDebug($"GetExistingBlobsInCromwellStorageLocationAsTesInputsAsync: commandScript path: {commandScript.Path} blobsInExecutionDirectory: '{string.Join("', '", blobsInExecutionDirectory.Select(b => b.Name))}'"); - var expectedPathParts = commandScript.Path.Split('/').Length; + var expectedPathParts = scriptPath.Split('/').Length; - additionalInputFiles = blobsInExecutionDirectory - .Where(b => b.Name.Split('/').Length == expectedPathParts) - .Select(b => new TesInput - { - Path = b.Name, - Url = b.Uri.AbsoluteUri, - Name = Path.GetFileName(b.Name), - Type = TesFileType.FILEEnum - }) - .ToList(); - } + additionalInputFiles = blobsInExecutionDirectory + .Where(b => b != scriptBlob) + .Where(b => b.Path.Split('/').Length == expectedPathParts) + .Select(b => 
new TesInput + { + Path = b.Path, + Url = b.Uri.AbsoluteUri, + Name = Path.GetFileName(b.Path), + Type = TesFileType.FILEEnum + }) + .ToList(); } return additionalInputFiles ?? new(); diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index c2964bdaf..8dd3a1579 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -4,6 +4,7 @@ using System; using System.Collections.Generic; using System.Linq; +//using System.Runtime.CompilerServices; using System.Runtime.ExceptionServices; using System.Threading; using System.Threading.Tasks; @@ -91,6 +92,8 @@ public async Task AddBatchTaskAsync(string tesTaskId, CloudTask cloudTask, strin { } } + //private void LogRetryError(Exception exception, int retryCount, TimeSpan timeSpan, string message, [CallerMemberName] string caller = default) { } + /// public Task DeleteBatchJobAsync(string jobId, CancellationToken cancellationToken) { @@ -227,7 +230,7 @@ public async Task GetStorageAccountInfoAsync(string storageA } /// - public IAsyncEnumerable<(string Name, Uri Uri)> ListBlobsAsync(Uri directoryUri, CancellationToken cancellationToken) + public IAsyncEnumerable ListBlobsAsync(Uri directoryUri, CancellationToken cancellationToken) { var ctx = new Context(); ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying ListBlobsAsync ({RetryCount}).", retryCount)); diff --git a/src/TesApi.Web/IAzureProxy.cs b/src/TesApi.Web/IAzureProxy.cs index 52334707e..03b5bc022 100644 --- a/src/TesApi.Web/IAzureProxy.cs +++ b/src/TesApi.Web/IAzureProxy.cs @@ -6,7 +6,6 @@ using System.Threading; using System.Threading.Tasks; using Microsoft.Azure.Batch; -using Microsoft.Azure.Batch.Common; using TesApi.Web.Storage; using BatchModels = Microsoft.Azure.Management.Batch.Models; @@ -137,8 +136,8 @@ public interface IAzureProxy /// /// Directory Uri /// A for controlling the lifetime of the asynchronous 
operation. - /// List of blob paths - IAsyncEnumerable<(string Name, Uri Uri)> ListBlobsAsync(Uri directoryUri, CancellationToken cancellationToken); + /// List of blob names and URLs without SAS tokens. + IAsyncEnumerable ListBlobsAsync(Uri directoryUri, CancellationToken cancellationToken); /// /// Fetches the blobs properties diff --git a/src/TesApi.Web/Management/IBatchQuotaProvider.cs b/src/TesApi.Web/Management/IBatchQuotaProvider.cs index d1e176374..4f8547aaf 100644 --- a/src/TesApi.Web/Management/IBatchQuotaProvider.cs +++ b/src/TesApi.Web/Management/IBatchQuotaProvider.cs @@ -40,12 +40,5 @@ Task GetQuotaForRequirementAsync( /// A for controlling the lifetime of the asynchronous operation. /// Task GetPoolAndJobQuotaAsync(CancellationToken cancellationToken); - - /// - /// Pool and Active Job and JobSchedule quotas. - /// - /// Pool quota. - /// Active Job and JobSchedule quota. - public record struct PoolAndJobQuota(int PoolQuota, int ActiveJobAndJobScheduleQuota); } } diff --git a/src/TesApi.Web/Management/IBatchQuotaVerifier.cs b/src/TesApi.Web/Management/IBatchQuotaVerifier.cs index fb5179edd..8c9d861a4 100644 --- a/src/TesApi.Web/Management/IBatchQuotaVerifier.cs +++ b/src/TesApi.Web/Management/IBatchQuotaVerifier.cs @@ -1,10 +1,10 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. -using System; using System.Threading; using System.Threading.Tasks; using Tes.Models; +using TesApi.Web.Management.Models.Quotas; namespace TesApi.Web.Management { @@ -37,12 +37,5 @@ public interface IBatchQuotaVerifier /// /// Batch quota provider IBatchQuotaProvider GetBatchQuotaProvider(); - - /// - /// Result of group checking quota for pools and jobs. - /// - /// The number of pools or jobs above the "required" request that exceeded the available quota. - /// The to return to the tasks that could not be accomodated. 
- public record struct CheckGroupPoolAndJobQuotaResult(int Exceeded, Exception Exception); } } diff --git a/src/TesApi.Web/Management/Models/Quotas/CheckGroupPoolAndJobQuotaResult.cs b/src/TesApi.Web/Management/Models/Quotas/CheckGroupPoolAndJobQuotaResult.cs new file mode 100644 index 000000000..985e02390 --- /dev/null +++ b/src/TesApi.Web/Management/Models/Quotas/CheckGroupPoolAndJobQuotaResult.cs @@ -0,0 +1,14 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System; + +namespace TesApi.Web.Management.Models.Quotas +{ + /// + /// Result of group checking quota for pools and jobs. + /// + /// The number of pools or jobs above the "required" request that exceeded the available quota. + /// The to return to the tasks that could not be accomodated. + public record struct CheckGroupPoolAndJobQuotaResult(int Exceeded, Exception Exception); +} diff --git a/src/TesApi.Web/Management/Models/Quotas/PoolAndJobQuota.cs b/src/TesApi.Web/Management/Models/Quotas/PoolAndJobQuota.cs new file mode 100644 index 000000000..7e5d7d80d --- /dev/null +++ b/src/TesApi.Web/Management/Models/Quotas/PoolAndJobQuota.cs @@ -0,0 +1,12 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +namespace TesApi.Web.Management.Models.Quotas +{ + /// + /// Pool and Active Job and JobSchedule quotas. + /// + /// Pool quota. + /// Active Job and JobSchedule quota. + public record struct PoolAndJobQuota(int PoolQuota, int ActiveJobAndJobScheduleQuota); +} diff --git a/src/TesApi.Web/Storage/BlobNameAndUri.cs b/src/TesApi.Web/Storage/BlobNameAndUri.cs new file mode 100644 index 000000000..f1ca3361c --- /dev/null +++ b/src/TesApi.Web/Storage/BlobNameAndUri.cs @@ -0,0 +1,14 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System; + +namespace TesApi.Web.Storage +{ + /// + /// A storage blob's BlobName and Uri without SAS token. 
+ /// + /// The "BlobName" (without container name or account name) of the storage blob. + /// The URL of the storage blob without any SAS token. + public record struct BlobNameAndUri(string BlobName, Uri BlobUri); +} diff --git a/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs b/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs index f6d7da10e..482f55225 100644 --- a/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs +++ b/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs @@ -126,6 +126,16 @@ public override async Task MapLocalPathToSasUrlAsync(string path, BlobSa } } + /// + /// Generates SAS token for storage blobs. + /// + /// Target of SAS token. + /// Length of time from now for which SAS token should remain valid. + /// Requested permissions to be included in the returned token. + /// A for controlling the lifetime of the asynchronous operation. + /// Logging metadata for failures locating storage account. + /// A targeting with the SAS token. + /// private Task AddSasTokenAsync(StorageAccountUrlSegments pathSegments, TimeSpan? sasTokenDuration, BlobSasPermissions blobPermissions, CancellationToken cancellationToken, string path = default) { if (pathSegments.IsContainer) @@ -136,6 +146,16 @@ private Task AddSasTokenAsync(StorageAccountUrlSegmen return AddSasTokenAsyncImpl(pathSegments, sasTokenDuration, (expiresOn, blobName) => new BlobSasBuilder(blobPermissions, expiresOn) { BlobName = blobPermissions.HasFlag(BlobSasPermissions.List) ? string.Empty : blobName }, path, cancellationToken); } + /// + /// Generates SAS token for storage blob containers. + /// + /// Target of SAS token. + /// Length of time from now for which SAS token should remain valid. + /// Requested permissions to be included in the returned token. + /// A for controlling the lifetime of the asynchronous operation. + /// Logging metadata for failures locating storage account. + /// A targeting with the SAS token. 
+ /// private Task AddSasTokenAsync(StorageAccountUrlSegments pathSegments, TimeSpan? sasTokenDuration, BlobContainerSasPermissions containerPermissions, CancellationToken cancellationToken, string path = default) { if (!pathSegments.IsContainer) @@ -147,13 +167,13 @@ private Task AddSasTokenAsync(StorageAccountUrlSegmen } /// - /// Generates SAS token for both blobs and containers. + /// Generates SAS token for both blobs and containers. Intended to be called from methods like and . /// /// Target of SAS token. /// Length of time from now for which SAS token should remain valid. /// A factory that generates a . Receives the expiration time and the blobName, which should be set on the sas builder as appropriate. /// Logging metadata for failures locating storage account. - /// + /// A for controlling the lifetime of the asynchronous operation. /// /// private async Task AddSasTokenAsyncImpl(StorageAccountUrlSegments pathSegments, TimeSpan? sasTokenDuration, Func createBuilder, string path, CancellationToken cancellationToken) diff --git a/src/TesApi.Web/Storage/IStorageAccessProvider.cs b/src/TesApi.Web/Storage/IStorageAccessProvider.cs index 9d6b485b7..13b205c4a 100644 --- a/src/TesApi.Web/Storage/IStorageAccessProvider.cs +++ b/src/TesApi.Web/Storage/IStorageAccessProvider.cs @@ -100,7 +100,7 @@ public interface IStorageAccessProvider /// - https://accountName.blob.core.windows.net/containerName/blobName /// /// The file path to convert. Two-part path is treated as container path. Paths with three or more parts are treated as blobs. - /// + /// Requested permissions to include in the SAS token. /// A for controlling the lifetime of the asynchronous operation. /// Duration SAS should be valid. 
/// An Azure Block Blob or Container URL with SAS token @@ -109,35 +109,35 @@ public interface IStorageAccessProvider /// /// Returns an Azure Storage Blob URL with a SAS token for the specified blob path in the TES internal storage location /// - /// - /// - /// - /// + /// A relative path within the blob storage space reserved for the TES server. + /// Requested permissions to include in the SAS token. + /// A for controlling the lifetime of the asynchronous operation. + /// A blob storage URL with SAS token. public Task GetInternalTesBlobUrlAsync(string blobPath, BlobSasPermissions sasPermissions, CancellationToken cancellationToken); /// /// Returns an Azure Storage Blob URL with a SAS token for the specified blob path in the TES task internal storage location. /// - /// - /// - /// - /// - /// + /// A + /// A relative path within the blob storage space reserved for the . + /// Requested permissions to include in the SAS token. + /// A for controlling the lifetime of the asynchronous operation. + /// A blob storage URL with SAS token. public Task GetInternalTesTaskBlobUrlAsync(TesTask task, string blobPath, BlobSasPermissions sasPermissions, CancellationToken cancellationToken); /// /// Returns an Azure Storage Blob URL without a SAS token for the specified blob path in the TES task internal storage location. /// - /// - /// - /// + /// A + /// A relative path within the blob storage space reserved for the . + /// A blob storage URL. public string GetInternalTesTaskBlobUrlWithoutSasToken(TesTask task, string blobPath); /// /// Returns an Azure Storage Blob URL without a SAS token for the specified blob path in the TES internal storage location. /// - /// - /// + /// A relative path within the blob storage space reserved for the TES server. + /// A blob storage URL. 
public string GetInternalTesBlobUrlWithoutSasToken(string blobPath); } } From 520c0e064d3536fc7857db8668df88c2595bdb7c Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 27 Oct 2023 00:58:55 -0700 Subject: [PATCH 055/202] Cleanup --- .../TestServices/TestServiceProvider.cs | 5 +- src/TesApi.Web/BatchScheduler.cs | 13 +- src/TesApi.Web/Events/TaskNodeEventMessage.cs | 43 ++++ ...ntMessage.cs => TaskNodeEventProcessor.cs} | 205 +++++++++++------- src/TesApi.Web/IBatchScheduler.cs | 2 +- src/TesApi.Web/Scheduler.cs | 55 +++-- src/TesApi.Web/Startup.cs | 3 +- 7 files changed, 209 insertions(+), 117 deletions(-) create mode 100644 src/TesApi.Web/Events/TaskNodeEventMessage.cs rename src/TesApi.Web/Events/{NodeEventMessage.cs => TaskNodeEventProcessor.cs} (62%) diff --git a/src/TesApi.Tests/TestServices/TestServiceProvider.cs b/src/TesApi.Tests/TestServices/TestServiceProvider.cs index deb48ad4f..d69aad472 100644 --- a/src/TesApi.Tests/TestServices/TestServiceProvider.cs +++ b/src/TesApi.Tests/TestServices/TestServiceProvider.cs @@ -84,15 +84,14 @@ internal TestServiceProvider( .AddTransient>(_ => NullLogger.Instance) .AddTransient>(_ => NullLogger.Instance) .AddTransient>(_ => NullLogger.Instance) - .AddTransient>(_ => NullLogger.Instance) + .AddTransient>(_ => NullLogger.Instance) .AddTransient>(_ => NullLogger.Instance) .AddSingleton() .AddSingleton() .AddSingleton() .AddSingleton>(s => () => s.GetService()) .AddTransient() - .AddSingleton, string, NodeEventMessage>>(s => (blobAbsoluteUri, tags, @event) => ActivatorUtilities.CreateInstance(s, blobAbsoluteUri, tags, @event)) - .AddTransient() + .AddSingleton() .AddSingleton() .AddSingleton(s => GetArmBatchQuotaProvider(s, armBatchQuotaProvider)) //added so config utils gets the arm implementation, to be removed once config utils is refactored. 
.AddSingleton() diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index a58f8ddd2..a70c149eb 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -74,7 +74,6 @@ public partial class BatchScheduler : IBatchScheduler private readonly ContainerRegistryProvider containerRegistryProvider; private readonly string batchPrefix; private readonly Func batchPoolFactory; - private readonly Func, string, NodeEventMessage> batchTesEventMessageFactory; private readonly IAllowedVmSizesService allowedVmSizesService; private readonly TaskExecutionScriptingManager taskExecutionScriptingManager; @@ -97,7 +96,6 @@ public partial class BatchScheduler : IBatchScheduler /// factory. /// Service to get allowed vm sizes. /// . - /// factory. public BatchScheduler( ILogger logger, IOptions batchGen1Options, @@ -114,8 +112,7 @@ public BatchScheduler( ContainerRegistryProvider containerRegistryProvider, Func poolFactory, IAllowedVmSizesService allowedVmSizesService, - TaskExecutionScriptingManager taskExecutionScriptingManager, - Func, string, NodeEventMessage> batchTesEventMessageFactory) + TaskExecutionScriptingManager taskExecutionScriptingManager) { ArgumentNullException.ThrowIfNull(logger); ArgumentNullException.ThrowIfNull(azureProxy); @@ -125,7 +122,6 @@ public BatchScheduler( ArgumentNullException.ThrowIfNull(containerRegistryProvider); ArgumentNullException.ThrowIfNull(poolFactory); ArgumentNullException.ThrowIfNull(taskExecutionScriptingManager); - ArgumentNullException.ThrowIfNull(batchTesEventMessageFactory); this.logger = logger; this.azureProxy = azureProxy; @@ -147,7 +143,6 @@ public BatchScheduler( this.globalManagedIdentity = batchNodesOptions.Value.GlobalManagedIdentity; this.allowedVmSizesService = allowedVmSizesService; this.taskExecutionScriptingManager = taskExecutionScriptingManager; - this.batchTesEventMessageFactory = batchTesEventMessageFactory; batchPoolFactory = poolFactory; batchPrefix = 
batchSchedulingOptions.Value.Prefix; @@ -1340,7 +1335,7 @@ private static Dictionary DelimitedTextToDictionary(string text, /// - public async IAsyncEnumerable GetEventMessagesAsync([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken, string @event) + public async IAsyncEnumerable GetEventMessagesAsync([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken, string @event) { const string eventsFolderName = "events"; var prefix = eventsFolderName + "/"; @@ -1363,7 +1358,7 @@ public async IAsyncEnumerable GetEventMessagesAsync([System.Ru cancellationToken) .WithCancellation(cancellationToken)) { - if (blobItem.Tags.ContainsKey(NodeEventMessage.ProcessedTag) || !blobItem.Tags.ContainsKey("task-id")) + if (blobItem.Tags.ContainsKey(TaskNodeEventProcessor.ProcessedTag) || !blobItem.Tags.ContainsKey("task-id")) { continue; } @@ -1373,7 +1368,7 @@ public async IAsyncEnumerable GetEventMessagesAsync([System.Ru var pathFromEventName = blobItem.Name[eventsEndIndex..]; var eventName = pathFromEventName[..pathFromEventName.IndexOf('/')]; - yield return batchTesEventMessageFactory(new(blobUrl), blobItem.Tags, eventName); + yield return new(new(blobUrl), blobItem.Tags, eventName); } } diff --git a/src/TesApi.Web/Events/TaskNodeEventMessage.cs b/src/TesApi.Web/Events/TaskNodeEventMessage.cs new file mode 100644 index 000000000..1f03a07e6 --- /dev/null +++ b/src/TesApi.Web/Events/TaskNodeEventMessage.cs @@ -0,0 +1,43 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System.Collections.Generic; +using System; + +namespace TesApi.Web.Events +{ + /// + /// from blob storage for processing by TES server. + /// + /// URL of the event message. + /// Tags on the event message blob. + /// Name of the event based on parsing the blob's BlobName. 
+ public record class TaskNodeEventMessage(Uri BlobUri, IDictionary Tags, string Event) + { + /// + /// + /// + public Tes.Runner.Events.EventMessage RunnerEventMessage { get; private set; } + + /// + /// + /// + public string TesTaskId => RunnerEventMessage?.EntityId; + + /// + /// Sets . + /// + /// The downloaded event message associated with this storage blob. + public void SetRunnerEventMessage(Tes.Runner.Events.EventMessage eventMessage) + { + ArgumentNullException.ThrowIfNull(eventMessage); + + if (RunnerEventMessage is not null) + { + throw new InvalidOperationException("RunnerEventMessage has already been set."); + } + + RunnerEventMessage = eventMessage; + } + } +} diff --git a/src/TesApi.Web/Events/NodeEventMessage.cs b/src/TesApi.Web/Events/TaskNodeEventProcessor.cs similarity index 62% rename from src/TesApi.Web/Events/NodeEventMessage.cs rename to src/TesApi.Web/Events/TaskNodeEventProcessor.cs index ef70ccdee..4e27a2516 100644 --- a/src/TesApi.Web/Events/NodeEventMessage.cs +++ b/src/TesApi.Web/Events/TaskNodeEventProcessor.cs @@ -14,7 +14,7 @@ namespace TesApi.Web.Events /// /// Represents an event sent by the node task runner. /// - public class NodeEventMessage + public class TaskNodeEventProcessor { /// /// Blob tag used to record event processing. @@ -24,72 +24,79 @@ public class NodeEventMessage private readonly IStorageAccessProvider _storageAccessProvider; private readonly IAzureProxy _azureProxy; private readonly ILogger _logger; - private readonly Uri _uri; /// - /// Tags of this event message. - /// - public IDictionary Tags { get; } - - /// - /// Event of this event message. - /// - public string Event { get; } - - /// - /// Constructor of . + /// Constructor of . 
/// /// /// /// - /// - /// - /// - public NodeEventMessage(IAzureProxy azureProxy, ILogger logger, IStorageAccessProvider storageAccessProvider, Uri blobAbsoluteUri, IDictionary tags, string @event) + public TaskNodeEventProcessor(IAzureProxy azureProxy, ILogger logger, IStorageAccessProvider storageAccessProvider) { ArgumentNullException.ThrowIfNull(azureProxy); ArgumentNullException.ThrowIfNull(storageAccessProvider); - ArgumentNullException.ThrowIfNull(blobAbsoluteUri); - ArgumentNullException.ThrowIfNull(tags); - ArgumentNullException.ThrowIfNull(@event); - if (tags.Count == 0) + _azureProxy = azureProxy; + _logger = logger; + _storageAccessProvider = storageAccessProvider; + } + + + /// + /// TODO + /// + /// + /// + public void ValidateMessageMetadata(TaskNodeEventMessage message) + { + ArgumentNullException.ThrowIfNull(message); + + if (message.BlobUri is null) { - throw new ArgumentException("This message has no tags.", nameof(tags)); + throw new ArgumentException("This message's URL is missing.", nameof(message)); } - if (tags.ContainsKey(ProcessedTag)) + if (message.Tags is null) { - throw new ArgumentException("This message was already processed.", nameof(tags)); + throw new ArgumentException("This message's Tags are missing.", nameof(message)); + } + + if (string.IsNullOrWhiteSpace(message.Event)) + { + throw new ArgumentException("This message's event type is missing.", nameof(message)); + } + + if (message.Tags.Count == 0) + { + throw new ArgumentException("This message has no tags.", nameof(message)); + } + + if (message.Tags.ContainsKey(ProcessedTag)) + { + throw new ArgumentException("This message was already processed.", nameof(message)); } // There are up to 10 tags allowed. We will be adding one. 
// https://learn.microsoft.com/azure/storage/blobs/storage-manage-find-blobs?tabs=azure-portal#setting-blob-index-tags - if (tags.Count > 9) + if (message.Tags.Count > 9) { - throw new ArgumentException("This message does not have space to add the processed tag.", nameof(tags)); + throw new ArgumentException("This message does not have space to add the processed tag.", nameof(message)); } - - _azureProxy = azureProxy; - _logger = logger; - _storageAccessProvider = storageAccessProvider; - _uri = blobAbsoluteUri; - Tags = tags.AsReadOnly(); - Event = @event; } /// /// Gets the details of this event message. /// + /// /// /// - public async Task<(string Id, AzureBatchTaskState State)> GetMessageBatchStateAsync(CancellationToken cancellationToken) + public async Task DownloadAndValidateMessageContentAsync(TaskNodeEventMessage message, CancellationToken cancellationToken) { - Tes.Runner.Events.EventMessage result = null; + Tes.Runner.Events.EventMessage result; try { - var messageText = await _azureProxy.DownloadBlobAsync(_uri, cancellationToken); + var messageText = await _azureProxy.DownloadBlobAsync(message.BlobUri, cancellationToken); result = System.Text.Json.JsonSerializer.Deserialize(messageText) ?? 
throw new InvalidOperationException("Deserialize() returned null."); } @@ -102,7 +109,7 @@ public NodeEventMessage(IAzureProxy azureProxy, ILogger logger System.Diagnostics.Debug.Assert(Tes.Runner.Events.EventsPublisher.EventVersion.Equals(result.EventVersion, StringComparison.Ordinal)); System.Diagnostics.Debug.Assert(Tes.Runner.Events.EventsPublisher.EventDataVersion.Equals(result.EventDataVersion, StringComparison.Ordinal)); System.Diagnostics.Debug.Assert(Tes.Runner.Events.EventsPublisher.TesTaskRunnerEntityType.Equals(result.EntityType, StringComparison.Ordinal)); - System.Diagnostics.Debug.Assert(Event.Equals(result.Name, StringComparison.Ordinal)); + System.Diagnostics.Debug.Assert(message.Event.Equals(result.Name, StringComparison.Ordinal)); // Event type specific validations switch (result.Name) @@ -140,33 +147,51 @@ public NodeEventMessage(IAzureProxy azureProxy, ILogger logger break; } - _logger.LogDebug("Getting batch task state from event {EventName} for {TesTask}.", result.Name ?? Event, result.EntityId); - return (result.EntityId, GetBatchTaskState(result)); + message.SetRunnerEventMessage(result); + } + + private enum EventsInOrder + { + downloadStart, + downloadEnd, + executorStart, + executorEnd, + uploadStart, + uploadEnd, + taskCompleted, } /// - /// Marks this event message processed. + /// Returns a sequence in the order the events were produced. /// - /// + /// + /// + /// /// - public async Task MarkMessageProcessed(CancellationToken cancellationToken) + public IEnumerable OrderProcessedByExecutorSequence(IEnumerable source, Func messageGetter) { - await _azureProxy.SetBlobTags( - _uri, - Tags - .Append(new(ProcessedTag, DateTime.UtcNow.ToString("O"))) - .ToDictionary(pair => pair.Key, pair => pair.Value), - cancellationToken); + return source.OrderBy(t => messageGetter(t).RunnerEventMessage.Created).ThenBy(t => Enum.TryParse(typeof(EventsInOrder), messageGetter(t).RunnerEventMessage.Name, true, out var result) ? 
result : -1); } - private AzureBatchTaskState GetBatchTaskState(Tes.Runner.Events.EventMessage message) + /// + /// Gets the task status details from this event message. + /// + /// + /// + public AzureBatchTaskState GetMessageBatchState(TaskNodeEventMessage message) { - return (message.Name ?? Event) switch + ArgumentNullException.ThrowIfNull(message); + ArgumentNullException.ThrowIfNull(message.RunnerEventMessage, nameof(message)); + + var nodeMessage = message.RunnerEventMessage; + + _logger.LogDebug("Getting batch task state from event {EventName} for {TesTask}.", nodeMessage.Name ?? message.Event, nodeMessage.EntityId); + return (nodeMessage.Name ?? message.Event) switch { Tes.Runner.Events.EventsPublisher.DownloadStartEvent => new(AzureBatchTaskState.TaskState.NoChange, - BatchTaskStartTime: message.Created), + BatchTaskStartTime: nodeMessage.Created), - Tes.Runner.Events.EventsPublisher.DownloadEndEvent => message.StatusMessage switch + Tes.Runner.Events.EventsPublisher.DownloadEndEvent => nodeMessage.StatusMessage switch { Tes.Runner.Events.EventsPublisher.SuccessStatus => new(AzureBatchTaskState.TaskState.NoChange), @@ -175,88 +200,104 @@ private AzureBatchTaskState GetBatchTaskState(Tes.Runner.Events.EventMessage mes Failure: new("SystemError", Enumerable.Empty() .Append("Download failed.") - .Append(message.EventData["errorMessage"]))), + .Append(nodeMessage.EventData["errorMessage"]))), _ => throw new System.Diagnostics.UnreachableException(), }, Tes.Runner.Events.EventsPublisher.ExecutorStartEvent => new(AzureBatchTaskState.TaskState.Running, - ExecutorStartTime: message.Created), + ExecutorStartTime: nodeMessage.Created), - Tes.Runner.Events.EventsPublisher.ExecutorEndEvent => message.StatusMessage switch + Tes.Runner.Events.EventsPublisher.ExecutorEndEvent => nodeMessage.StatusMessage switch { Tes.Runner.Events.EventsPublisher.SuccessStatus => new( AzureBatchTaskState.TaskState.InfoUpdate, - ExecutorEndTime: message.Created, - ExecutorExitCode: 
int.Parse(message.EventData["exitCode"])), + ExecutorEndTime: nodeMessage.Created, + ExecutorExitCode: int.Parse(nodeMessage.EventData["exitCode"])), Tes.Runner.Events.EventsPublisher.FailedStatus => new( AzureBatchTaskState.TaskState.InfoUpdate, Failure: new("ExecutorError", Enumerable.Empty() - .Append(message.EventData["errorMessage"])), - ExecutorEndTime: message.Created, - ExecutorExitCode: int.Parse(message.EventData["exitCode"])), + .Append(nodeMessage.EventData["errorMessage"])), + ExecutorEndTime: nodeMessage.Created, + ExecutorExitCode: int.Parse(nodeMessage.EventData["exitCode"])), _ => throw new System.Diagnostics.UnreachableException(), }, Tes.Runner.Events.EventsPublisher.UploadStartEvent => new(AzureBatchTaskState.TaskState.NoChange), - Tes.Runner.Events.EventsPublisher.UploadEndEvent => message.StatusMessage switch + Tes.Runner.Events.EventsPublisher.UploadEndEvent => nodeMessage.StatusMessage switch { Tes.Runner.Events.EventsPublisher.SuccessStatus => new( AzureBatchTaskState.TaskState.InfoUpdate, - OutputFileLogs: GetFileLogs(message.EventData)), + OutputFileLogs: GetFileLogs(nodeMessage.EventData)), Tes.Runner.Events.EventsPublisher.FailedStatus => new( AzureBatchTaskState.TaskState.NodeFilesUploadOrDownloadFailed, Failure: new("SystemError", Enumerable.Empty() .Append("Upload failed.") - .Append(message.EventData["errorMessage"]))), + .Append(nodeMessage.EventData["errorMessage"]))), _ => throw new System.Diagnostics.UnreachableException(), }, - Tes.Runner.Events.EventsPublisher.TaskCompletionEvent => message.StatusMessage switch + Tes.Runner.Events.EventsPublisher.TaskCompletionEvent => nodeMessage.StatusMessage switch { Tes.Runner.Events.EventsPublisher.SuccessStatus => new( AzureBatchTaskState.TaskState.CompletedSuccessfully, - BatchTaskStartTime: message.Created - TimeSpan.Parse(message.EventData["duration"]), - BatchTaskEndTime: message.Created), + BatchTaskStartTime: nodeMessage.Created - TimeSpan.Parse(nodeMessage.EventData["duration"]), 
+ BatchTaskEndTime: nodeMessage.Created), Tes.Runner.Events.EventsPublisher.FailedStatus => new( AzureBatchTaskState.TaskState.CompletedWithErrors, Failure: new("SystemError", Enumerable.Empty() .Append("Node script failed.") - .Append(message.EventData["errorMessage"])), - BatchTaskStartTime: message.Created - TimeSpan.Parse(message.EventData["duration"]), - BatchTaskEndTime: message.Created), + .Append(nodeMessage.EventData["errorMessage"])), + BatchTaskStartTime: nodeMessage.Created - TimeSpan.Parse(nodeMessage.EventData["duration"]), + BatchTaskEndTime: nodeMessage.Created), _ => throw new System.Diagnostics.UnreachableException(), }, _ => throw new System.Diagnostics.UnreachableException(), }; - } - private static IEnumerable GetFileLogs(IDictionary eventData) - { - if (eventData is null) + static IEnumerable GetFileLogs(IDictionary eventData) { - yield break; - } + if (eventData is null) + { + yield break; + } - var numberOfFiles = int.Parse(eventData["numberOfFiles"]); - for (var i = 0; i < numberOfFiles; ++i) - { - yield return new( - new Uri(eventData[$"fileUri-{i}"]), - eventData[$"filePath-{i}"], - long.Parse(eventData[$"fileSize-{i}"])); + var numberOfFiles = int.Parse(eventData["numberOfFiles"]); + for (var i = 0; i < numberOfFiles; ++i) + { + yield return new( + new Uri(eventData[$"fileUri-{i}"]), + eventData[$"filePath-{i}"], + long.Parse(eventData[$"fileSize-{i}"])); + } } } + + /// + /// Marks this event message processed. 
+ /// + /// + /// + /// + public async Task MarkMessageProcessedAsync(TaskNodeEventMessage message, CancellationToken cancellationToken) + { + await _azureProxy.SetBlobTags( + message.BlobUri, + message.Tags + .Append(new(ProcessedTag, DateTime.UtcNow.ToString("O"))) + .ToDictionary(pair => pair.Key, pair => pair.Value), + cancellationToken); + } } } diff --git a/src/TesApi.Web/IBatchScheduler.cs b/src/TesApi.Web/IBatchScheduler.cs index 4e8d06e4d..a7d5f20e2 100644 --- a/src/TesApi.Web/IBatchScheduler.cs +++ b/src/TesApi.Web/IBatchScheduler.cs @@ -108,6 +108,6 @@ public interface IBatchScheduler /// A for controlling the lifetime of the asynchronous operation. /// Optional event to retrieve. Defaults to all events. /// - IAsyncEnumerable GetEventMessagesAsync(CancellationToken cancellationToken, string @event = default); + IAsyncEnumerable GetEventMessagesAsync(CancellationToken cancellationToken, string @event = default); } } diff --git a/src/TesApi.Web/Scheduler.cs b/src/TesApi.Web/Scheduler.cs index c8622ff4c..34be35bda 100644 --- a/src/TesApi.Web/Scheduler.cs +++ b/src/TesApi.Web/Scheduler.cs @@ -23,16 +23,21 @@ internal class Scheduler : OrchestrateOnBatchSchedulerServiceBase { private readonly TimeSpan blobRunInterval = TimeSpan.FromSeconds(5); private readonly TimeSpan batchRunInterval = TimeSpan.FromSeconds(30); // The very fastest process inside of Azure Batch accessing anything within pools or jobs uses a 30 second polling interval + private readonly TaskNodeEventProcessor nodeEventProcessor; /// /// Default constructor /// + /// The task node event processor. /// Used for requesting termination of the current application during initialization. 
/// The main TES task database repository implementation /// The batch scheduler implementation /// The logger instance - public Scheduler(Microsoft.Extensions.Hosting.IHostApplicationLifetime hostApplicationLifetime, IRepository repository, IBatchScheduler batchScheduler, ILogger logger) - : base(hostApplicationLifetime, repository, batchScheduler, logger) { } + public Scheduler(TaskNodeEventProcessor nodeEventProcessor, Microsoft.Extensions.Hosting.IHostApplicationLifetime hostApplicationLifetime, IRepository repository, IBatchScheduler batchScheduler, ILogger logger) + : base(hostApplicationLifetime, repository, batchScheduler, logger) + { + this.nodeEventProcessor = nodeEventProcessor; + } /// @@ -158,21 +163,37 @@ private Task ExecuteUpdateTesTaskFromEventBlobAsync(CancellationToken stoppingTo /// async ValueTask UpdateTesTasksFromEventBlobsAsync(CancellationToken stoppingToken) { - var messageInfos = new ConcurrentBag(); - var messages = new ConcurrentBag<(string Id, AzureBatchTaskState State)>(); + Func> getEventsInOrder; - // Get and parse event blobs - await foreach (var message in batchScheduler.GetEventMessagesAsync(stoppingToken) - .WithCancellation(stoppingToken)) { - messageInfos.Add(message); - } + var messageInfos = new ConcurrentBag(); + var messages = new ConcurrentBag<(TaskNodeEventMessage Message, AzureBatchTaskState State)>(); - try - { + // Get and parse event blobs + await foreach (var message in batchScheduler.GetEventMessagesAsync(stoppingToken) + .WithCancellation(stoppingToken)) + { + messageInfos.Add(message); + } + + //try + //{ await Parallel.ForEachAsync(messageInfos, ProcessMessage); + //} + //catch { } // TODO: identify exceptions + + async ValueTask ProcessMessage(TaskNodeEventMessage messageInfo, CancellationToken cancellationToken) + { + nodeEventProcessor.ValidateMessageMetadata(messageInfo); + await nodeEventProcessor.DownloadAndValidateMessageContentAsync(messageInfo, cancellationToken); + messages.Add((messageInfo, 
nodeEventProcessor.GetMessageBatchState(messageInfo))); + await nodeEventProcessor.MarkMessageProcessedAsync(messageInfo, cancellationToken); + } + + getEventsInOrder = () => nodeEventProcessor.OrderProcessedByExecutorSequence(messages, item => item.Message); } - catch { } // TODO: identify exceptions + + var orderedMessageList = getEventsInOrder().ToList(); // Update TesTasks await OrchestrateTesTasksOnBatchAsync( @@ -180,20 +201,14 @@ await OrchestrateTesTasksOnBatchAsync( #pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously async token => GetTesTasks(token), #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously - (tesTasks, token) => batchScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, messages.Select(t => t.State).ToArray(), token), + (tesTasks, token) => batchScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, orderedMessageList.Select(t => t.State).ToArray(), token), stoppingToken, "events"); // Helpers - async ValueTask ProcessMessage(NodeEventMessage messageInfo, CancellationToken cancellationToken) - { - messages.Add(await messageInfo.GetMessageBatchStateAsync(cancellationToken)); - await messageInfo.MarkMessageProcessed(cancellationToken); - } - async IAsyncEnumerable GetTesTasks([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) { - foreach (var id in messages.Select(t => batchScheduler.GetTesTaskIdFromCloudTaskId(t.Id))) + foreach (var id in orderedMessageList.Select(t => t.Message.TesTaskId)) { TesTask tesTask = default; if (await repository.TryGetItemAsync(id, cancellationToken, task => tesTask = task) && tesTask is not null) diff --git a/src/TesApi.Web/Startup.cs b/src/TesApi.Web/Startup.cs index 2ced4bf44..0810c12ee 100644 --- a/src/TesApi.Web/Startup.cs +++ b/src/TesApi.Web/Startup.cs @@ -81,8 +81,7 @@ public void ConfigureServices(IServiceCollection services) .AddSingleton, TesRepositoryCache>() .AddSingleton() 
.AddSingleton() - .AddTransient() - .AddSingleton, string, Events.NodeEventMessage>>(s => (blobAbsoluteUri, tags, @event) => ActivatorUtilities.CreateInstance(s, blobAbsoluteUri, tags, @event)) + .AddSingleton() .AddTransient() .AddSingleton>(services => () => services.GetService()) .AddSingleton(CreateBatchPoolManagerFromConfiguration) From 505f71ad3bb24654af60b7edd4e42bddd166fd25 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 27 Oct 2023 01:11:07 -0700 Subject: [PATCH 056/202] Formatting and renaming --- .../TestServices/TestServiceProvider.cs | 4 ++-- src/TesApi.Web/BatchScheduler.cs | 4 ++-- ...EventMessage.cs => RunnerEventsMessage.cs} | 4 ++-- ...tProcessor.cs => RunnerEventsProcessor.cs} | 21 +++++++------------ src/TesApi.Web/IBatchScheduler.cs | 2 +- src/TesApi.Web/Scheduler.cs | 12 +++++------ src/TesApi.Web/Startup.cs | 2 +- 7 files changed, 22 insertions(+), 27 deletions(-) rename src/TesApi.Web/Events/{TaskNodeEventMessage.cs => RunnerEventsMessage.cs} (92%) rename src/TesApi.Web/Events/{TaskNodeEventProcessor.cs => RunnerEventsProcessor.cs} (93%) diff --git a/src/TesApi.Tests/TestServices/TestServiceProvider.cs b/src/TesApi.Tests/TestServices/TestServiceProvider.cs index d69aad472..a75634130 100644 --- a/src/TesApi.Tests/TestServices/TestServiceProvider.cs +++ b/src/TesApi.Tests/TestServices/TestServiceProvider.cs @@ -84,14 +84,14 @@ internal TestServiceProvider( .AddTransient>(_ => NullLogger.Instance) .AddTransient>(_ => NullLogger.Instance) .AddTransient>(_ => NullLogger.Instance) - .AddTransient>(_ => NullLogger.Instance) + .AddTransient>(_ => NullLogger.Instance) .AddTransient>(_ => NullLogger.Instance) .AddSingleton() .AddSingleton() .AddSingleton() .AddSingleton>(s => () => s.GetService()) .AddTransient() - .AddSingleton() + .AddSingleton() .AddSingleton() .AddSingleton(s => GetArmBatchQuotaProvider(s, armBatchQuotaProvider)) //added so config utils gets the arm implementation, to be removed once config utils is refactored. 
.AddSingleton() diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index a70c149eb..efcddea2a 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -1335,7 +1335,7 @@ private static Dictionary DelimitedTextToDictionary(string text, /// - public async IAsyncEnumerable GetEventMessagesAsync([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken, string @event) + public async IAsyncEnumerable GetEventMessagesAsync([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken, string @event) { const string eventsFolderName = "events"; var prefix = eventsFolderName + "/"; @@ -1358,7 +1358,7 @@ public async IAsyncEnumerable GetEventMessagesAsync([Syste cancellationToken) .WithCancellation(cancellationToken)) { - if (blobItem.Tags.ContainsKey(TaskNodeEventProcessor.ProcessedTag) || !blobItem.Tags.ContainsKey("task-id")) + if (blobItem.Tags.ContainsKey(RunnerEventsProcessor.ProcessedTag) || !blobItem.Tags.ContainsKey("task-id")) { continue; } diff --git a/src/TesApi.Web/Events/TaskNodeEventMessage.cs b/src/TesApi.Web/Events/RunnerEventsMessage.cs similarity index 92% rename from src/TesApi.Web/Events/TaskNodeEventMessage.cs rename to src/TesApi.Web/Events/RunnerEventsMessage.cs index 1f03a07e6..356260d32 100644 --- a/src/TesApi.Web/Events/TaskNodeEventMessage.cs +++ b/src/TesApi.Web/Events/RunnerEventsMessage.cs @@ -1,8 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. -using System.Collections.Generic; using System; +using System.Collections.Generic; namespace TesApi.Web.Events { @@ -12,7 +12,7 @@ namespace TesApi.Web.Events /// URL of the event message. /// Tags on the event message blob. /// Name of the event based on parsing the blob's BlobName. 
- public record class TaskNodeEventMessage(Uri BlobUri, IDictionary Tags, string Event) + public record class RunnerEventsMessage(Uri BlobUri, IDictionary Tags, string Event) { /// /// diff --git a/src/TesApi.Web/Events/TaskNodeEventProcessor.cs b/src/TesApi.Web/Events/RunnerEventsProcessor.cs similarity index 93% rename from src/TesApi.Web/Events/TaskNodeEventProcessor.cs rename to src/TesApi.Web/Events/RunnerEventsProcessor.cs index 4e27a2516..37c33da88 100644 --- a/src/TesApi.Web/Events/TaskNodeEventProcessor.cs +++ b/src/TesApi.Web/Events/RunnerEventsProcessor.cs @@ -7,38 +7,33 @@ using System.Threading; using System.Threading.Tasks; using Microsoft.Extensions.Logging; -using TesApi.Web.Storage; namespace TesApi.Web.Events { /// /// Represents an event sent by the node task runner. /// - public class TaskNodeEventProcessor + public class RunnerEventsProcessor { /// /// Blob tag used to record event processing. /// public const string ProcessedTag = "processed"; - private readonly IStorageAccessProvider _storageAccessProvider; private readonly IAzureProxy _azureProxy; private readonly ILogger _logger; /// - /// Constructor of . + /// Constructor of . 
/// /// /// - /// - public TaskNodeEventProcessor(IAzureProxy azureProxy, ILogger logger, IStorageAccessProvider storageAccessProvider) + public RunnerEventsProcessor(IAzureProxy azureProxy, ILogger logger) { ArgumentNullException.ThrowIfNull(azureProxy); - ArgumentNullException.ThrowIfNull(storageAccessProvider); _azureProxy = azureProxy; _logger = logger; - _storageAccessProvider = storageAccessProvider; } @@ -47,7 +42,7 @@ public TaskNodeEventProcessor(IAzureProxy azureProxy, ILogger /// /// - public void ValidateMessageMetadata(TaskNodeEventMessage message) + public void ValidateMessageMetadata(RunnerEventsMessage message) { ArgumentNullException.ThrowIfNull(message); @@ -90,7 +85,7 @@ public void ValidateMessageMetadata(TaskNodeEventMessage message) /// /// /// - public async Task DownloadAndValidateMessageContentAsync(TaskNodeEventMessage message, CancellationToken cancellationToken) + public async Task DownloadAndValidateMessageContentAsync(RunnerEventsMessage message, CancellationToken cancellationToken) { Tes.Runner.Events.EventMessage result; @@ -168,7 +163,7 @@ private enum EventsInOrder /// /// /// - public IEnumerable OrderProcessedByExecutorSequence(IEnumerable source, Func messageGetter) + public IEnumerable OrderProcessedByExecutorSequence(IEnumerable source, Func messageGetter) { return source.OrderBy(t => messageGetter(t).RunnerEventMessage.Created).ThenBy(t => Enum.TryParse(typeof(EventsInOrder), messageGetter(t).RunnerEventMessage.Name, true, out var result) ? 
result : -1); } @@ -178,7 +173,7 @@ public IEnumerable OrderProcessedByExecutorSequence(IEnumerable source, /// /// /// - public AzureBatchTaskState GetMessageBatchState(TaskNodeEventMessage message) + public AzureBatchTaskState GetMessageBatchState(RunnerEventsMessage message) { ArgumentNullException.ThrowIfNull(message); ArgumentNullException.ThrowIfNull(message.RunnerEventMessage, nameof(message)); @@ -290,7 +285,7 @@ public AzureBatchTaskState GetMessageBatchState(TaskNodeEventMessage message) /// /// /// - public async Task MarkMessageProcessedAsync(TaskNodeEventMessage message, CancellationToken cancellationToken) + public async Task MarkMessageProcessedAsync(RunnerEventsMessage message, CancellationToken cancellationToken) { await _azureProxy.SetBlobTags( message.BlobUri, diff --git a/src/TesApi.Web/IBatchScheduler.cs b/src/TesApi.Web/IBatchScheduler.cs index a7d5f20e2..3f1c41711 100644 --- a/src/TesApi.Web/IBatchScheduler.cs +++ b/src/TesApi.Web/IBatchScheduler.cs @@ -108,6 +108,6 @@ public interface IBatchScheduler /// A for controlling the lifetime of the asynchronous operation. /// Optional event to retrieve. Defaults to all events. 
/// - IAsyncEnumerable GetEventMessagesAsync(CancellationToken cancellationToken, string @event = default); + IAsyncEnumerable GetEventMessagesAsync(CancellationToken cancellationToken, string @event = default); } } diff --git a/src/TesApi.Web/Scheduler.cs b/src/TesApi.Web/Scheduler.cs index 34be35bda..e8ca6a235 100644 --- a/src/TesApi.Web/Scheduler.cs +++ b/src/TesApi.Web/Scheduler.cs @@ -23,7 +23,7 @@ internal class Scheduler : OrchestrateOnBatchSchedulerServiceBase { private readonly TimeSpan blobRunInterval = TimeSpan.FromSeconds(5); private readonly TimeSpan batchRunInterval = TimeSpan.FromSeconds(30); // The very fastest process inside of Azure Batch accessing anything within pools or jobs uses a 30 second polling interval - private readonly TaskNodeEventProcessor nodeEventProcessor; + private readonly RunnerEventsProcessor nodeEventProcessor; /// /// Default constructor @@ -33,7 +33,7 @@ internal class Scheduler : OrchestrateOnBatchSchedulerServiceBase /// The main TES task database repository implementation /// The batch scheduler implementation /// The logger instance - public Scheduler(TaskNodeEventProcessor nodeEventProcessor, Microsoft.Extensions.Hosting.IHostApplicationLifetime hostApplicationLifetime, IRepository repository, IBatchScheduler batchScheduler, ILogger logger) + public Scheduler(RunnerEventsProcessor nodeEventProcessor, Microsoft.Extensions.Hosting.IHostApplicationLifetime hostApplicationLifetime, IRepository repository, IBatchScheduler batchScheduler, ILogger logger) : base(hostApplicationLifetime, repository, batchScheduler, logger) { this.nodeEventProcessor = nodeEventProcessor; @@ -163,11 +163,11 @@ private Task ExecuteUpdateTesTaskFromEventBlobAsync(CancellationToken stoppingTo /// async ValueTask UpdateTesTasksFromEventBlobsAsync(CancellationToken stoppingToken) { - Func> getEventsInOrder; + Func> getEventsInOrder; { - var messageInfos = new ConcurrentBag(); - var messages = new ConcurrentBag<(TaskNodeEventMessage Message, 
AzureBatchTaskState State)>(); + var messageInfos = new ConcurrentBag(); + var messages = new ConcurrentBag<(RunnerEventsMessage Message, AzureBatchTaskState State)>(); // Get and parse event blobs await foreach (var message in batchScheduler.GetEventMessagesAsync(stoppingToken) @@ -182,7 +182,7 @@ async ValueTask UpdateTesTasksFromEventBlobsAsync(CancellationToken stoppingToke //} //catch { } // TODO: identify exceptions - async ValueTask ProcessMessage(TaskNodeEventMessage messageInfo, CancellationToken cancellationToken) + async ValueTask ProcessMessage(RunnerEventsMessage messageInfo, CancellationToken cancellationToken) { nodeEventProcessor.ValidateMessageMetadata(messageInfo); await nodeEventProcessor.DownloadAndValidateMessageContentAsync(messageInfo, cancellationToken); diff --git a/src/TesApi.Web/Startup.cs b/src/TesApi.Web/Startup.cs index 0810c12ee..f74a42d56 100644 --- a/src/TesApi.Web/Startup.cs +++ b/src/TesApi.Web/Startup.cs @@ -81,7 +81,7 @@ public void ConfigureServices(IServiceCollection services) .AddSingleton, TesRepositoryCache>() .AddSingleton() .AddSingleton() - .AddSingleton() + .AddSingleton() .AddTransient() .AddSingleton>(services => () => services.GetService()) .AddSingleton(CreateBatchPoolManagerFromConfiguration) From f605f2e3850da414af61732813d9d7c36a00dc15 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 27 Oct 2023 12:47:35 -0700 Subject: [PATCH 057/202] more logging to find reason event metadata isn't populating task log --- src/TesApi.Web/AzureBatchTaskState.cs | 4 +- src/TesApi.Web/BatchPoolService.cs | 16 +++- src/TesApi.Web/BatchScheduler.cs | 87 +++++++++++++------ src/TesApi.Web/ConfigurationUtils.cs | 11 +-- src/TesApi.Web/Scheduler.cs | 62 ++++++++----- src/deploy-tes-on-azure/Deployer.cs | 52 ++--------- .../deploy-tes-on-azure.csproj | 1 - .../scripts/containers-to-mount | 32 ------- 8 files changed, 123 insertions(+), 142 deletions(-) delete mode 100644 src/deploy-tes-on-azure/scripts/containers-to-mount diff 
--git a/src/TesApi.Web/AzureBatchTaskState.cs b/src/TesApi.Web/AzureBatchTaskState.cs index 254fb2df8..459e039e8 100644 --- a/src/TesApi.Web/AzureBatchTaskState.cs +++ b/src/TesApi.Web/AzureBatchTaskState.cs @@ -21,6 +21,7 @@ namespace TesApi.Web /// /// . /// . + /// Warning. First item in enumeration is the Warning code, rest of items are additional system log entries. public record AzureBatchTaskState( TaskState State, IEnumerable OutputFileLogs = default, @@ -31,7 +32,8 @@ public record AzureBatchTaskState( DateTimeOffset? ExecutorEndTime = default, int? ExecutorExitCode = default, DateTimeOffset? BatchTaskEndTime = default, - int? BatchTaskExitCode = default) + int? BatchTaskExitCode = default, + IEnumerable Warning = default) { /// /// TesTask's state diff --git a/src/TesApi.Web/BatchPoolService.cs b/src/TesApi.Web/BatchPoolService.cs index ec560534c..76b1a67d6 100644 --- a/src/TesApi.Web/BatchPoolService.cs +++ b/src/TesApi.Web/BatchPoolService.cs @@ -71,17 +71,17 @@ private async ValueTask ExecuteActionOnPoolsAsync(string pollName, Func { try { - await action(pool, stoppingToken); + await action(pool, token); } catch (Exception exc) { logger.LogError(exc, @"Batch pool {PoolId} threw an exception in {Poll}.", pool.Id, pollName); } - } + }); logger.LogDebug(@"{Poll} for {PoolsCount} pools completed in {TotalSeconds} seconds.", pollName, pools.Count, DateTime.UtcNow.Subtract(startTime).TotalSeconds); } @@ -114,6 +114,11 @@ await ExecuteActionOnPoolsAsync( }, stoppingToken); + if (list.IsEmpty) + { + return; + } + await OrchestrateTesTasksOnBatchAsync( "Failures", #pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously @@ -156,7 +161,10 @@ private async ValueTask ProcessCompletedCloudTasksAsync(CancellationToken stoppi await ExecuteActionOnPoolsAsync("Service Batch Tasks", async (pool, token) => await pool.GetCompletedTasks(token).ForEachAsync(tasks.Add, token), stoppingToken); - 
logger.LogDebug("ProcessCompletedCloudTasksAsync found {CompletedTasks} completed tasks.", tasks.Count); + if (tasks.IsEmpty) + { + return; + } await OrchestrateTesTasksOnBatchAsync( "Completed", diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index efcddea2a..d8eeb67d5 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -139,6 +139,7 @@ public BatchScheduler( this.disableBatchNodesPublicIpAddress = batchNodesOptions.Value.DisablePublicIpAddress; this.poolLifetime = TimeSpan.FromDays(batchSchedulingOptions.Value.PoolRotationForcedDays == 0 ? Options.BatchSchedulingOptions.DefaultPoolRotationForcedDays : batchSchedulingOptions.Value.PoolRotationForcedDays); this.defaultStorageAccountName = storageOptions.Value.DefaultAccountName; + logger.LogInformation(@"Default storage account: {DefaultStorageAccountName}", defaultStorageAccountName); this.globalStartTaskPath = StandardizeStartTaskPath(batchNodesOptions.Value.GlobalStartTask, this.defaultStorageAccountName); this.globalManagedIdentity = batchNodesOptions.Value.GlobalManagedIdentity; this.allowedVmSizesService = allowedVmSizesService; @@ -167,7 +168,7 @@ public BatchScheduler( BatchNodeAgentSkuId = batchGen1Options.Value.NodeAgentSkuId }; - logger.LogInformation($"usePreemptibleVmsOnly: {usePreemptibleVmsOnly}"); + logger.LogInformation(@"usePreemptibleVmsOnly: {UsePreemptibleVmsOnly}", usePreemptibleVmsOnly); static bool tesTaskIsQueuedInitializingOrRunning(TesTask tesTask) => tesTask.State == TesState.QUEUEDEnum || tesTask.State == TesState.INITIALIZINGEnum || tesTask.State == TesState.RUNNINGEnum; static bool tesTaskIsInitializingOrRunning(TesTask tesTask) => tesTask.State == TesState.INITIALIZINGEnum || tesTask.State == TesState.RUNNINGEnum; @@ -176,46 +177,80 @@ public BatchScheduler( static bool tesTaskCancellationRequested(TesTask tesTask) => tesTask.State == TesState.CANCELINGEnum; static bool tesTaskDeletionReady(TesTask tesTask) => 
tesTask.IsTaskDeletionRequired; + var setTaskStateLock = new object(); + async Task SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { - var (batchNodeMetrics, taskStartTime, taskEndTime, cromwellRcCode) = newTaskState == TesState.COMPLETEEnum - ? await GetBatchNodeMetricsAndCromwellResultCodeAsync(tesTask, cancellationToken) - : default; - - tesTask.State = newTaskState; + { + var newData = System.Text.Json.JsonSerializer.Serialize(batchInfo, new System.Text.Json.JsonSerializerOptions { DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingDefault }); - var tesTaskLog = tesTask.GetOrAddTesTaskLog(); - var tesTaskExecutorLog = tesTaskLog.GetOrAddExecutorLog(); + if ("{}".Equals(newData) && newTaskState == tesTask.State) + { + logger.LogDebug(@"For task {TesTask} there's nothing to change.", tesTask.Id); + return false; + } - tesTaskLog.BatchNodeMetrics = batchNodeMetrics; - tesTaskLog.CromwellResultCode = cromwellRcCode; - tesTaskLog.EndTime ??= taskEndTime ?? batchInfo.BatchTaskEndTime; - tesTaskLog.StartTime ??= taskStartTime ?? batchInfo.BatchTaskStartTime; - tesTaskLog.Outputs ??= batchInfo.OutputFileLogs?.Select(entry => new Tes.Models.TesOutputFileLog { Path = entry.Path, SizeBytes = $"{entry.Size}", Url = entry.Url.AbsoluteUri }).ToList(); - tesTaskExecutorLog.StartTime ??= batchInfo.ExecutorEndTime; - tesTaskExecutorLog.EndTime ??= batchInfo.ExecutorEndTime; - tesTaskExecutorLog.ExitCode ??= batchInfo.BatchTaskExitCode; + logger.LogDebug(@"Setting task {TesTask} with metadata {Metadata}.", tesTask.Id, newData); + } - // Only accurate when the task completes successfully, otherwise it's the Batch time as reported from Batch - // TODO this could get large; why? 
- //var timefromCoAScriptCompletionToBatchTaskDetectedComplete = tesTaskLog.EndTime - tesTaskExecutorLog.EndTime; + var(batchNodeMetrics, taskStartTime, taskEndTime, cromwellRcCode) = newTaskState == TesState.COMPLETEEnum + ? await GetBatchNodeMetricsAndCromwellResultCodeAsync(tesTask, cancellationToken) + : default; - if (batchInfo.Failure is not null) + lock (setTaskStateLock) { - tesTask.SetFailureReason(batchInfo.Failure.Reason); + tesTask.State = newTaskState; + + var tesTaskLog = tesTask.GetOrAddTesTaskLog(); + var tesTaskExecutorLog = tesTaskLog.GetOrAddExecutorLog(); + + tesTaskLog.BatchNodeMetrics = batchNodeMetrics; + tesTaskLog.CromwellResultCode = cromwellRcCode; + tesTaskLog.EndTime ??= taskEndTime ?? batchInfo.BatchTaskEndTime; + tesTaskLog.StartTime ??= taskStartTime ?? batchInfo.BatchTaskStartTime; + tesTaskLog.Outputs ??= batchInfo.OutputFileLogs?.Select(entry => new Tes.Models.TesOutputFileLog { Path = entry.Path, SizeBytes = $"{entry.Size}", Url = entry.Url.AbsoluteUri }).ToList(); + tesTaskExecutorLog.StartTime ??= batchInfo.ExecutorEndTime; + tesTaskExecutorLog.EndTime ??= batchInfo.ExecutorEndTime; + tesTaskExecutorLog.ExitCode ??= batchInfo.BatchTaskExitCode; + + // Only accurate when the task completes successfully, otherwise it's the Batch time as reported from Batch + // TODO this could get large; why? 
+ //var timefromCoAScriptCompletionToBatchTaskDetectedComplete = tesTaskLog.EndTime - tesTaskExecutorLog.EndTime; - if (batchInfo.Failure.SystemLogs is not null) + if (batchInfo.Warning is not null) { - tesTask.AddToSystemLog(batchInfo.Failure.SystemLogs); + var warningInfo = batchInfo.Warning.ToList(); + switch (warningInfo.Count) + { + case 0: + break; + case 1: + tesTask.SetWarning(warningInfo[0]); + break; + default: + tesTask.SetWarning(warningInfo[0], warningInfo.Skip(1).ToArray()); + break; + } } - else if (!string.IsNullOrWhiteSpace(batchInfo.AlternateSystemLogItem)) + + if (batchInfo.Failure is not null) { - tesTask.AddToSystemLog(new[] { batchInfo.AlternateSystemLogItem }); + tesTask.SetFailureReason(batchInfo.Failure.Reason); + + if (batchInfo.Failure.SystemLogs is not null) + { + tesTask.AddToSystemLog(batchInfo.Failure.SystemLogs); + } + else if (!string.IsNullOrWhiteSpace(batchInfo.AlternateSystemLogItem)) + { + tesTask.AddToSystemLog(new[] { batchInfo.AlternateSystemLogItem }); + } } } if (!tesTask.IsActiveState()) { + logger.LogDebug(@"Uploading completed {TesTask}.", tesTask.Id); await taskExecutionScriptingManager.TryUploadServerTesTask(tesTask, "server-tes-task-completed.json", cancellationToken); } @@ -333,7 +368,7 @@ private async Task TerminateBatchTaskAsync(TesTask tesTask, CancellationTo /// The command to execute private string CreateWgetDownloadCommand(string urlToDownload, string localFilePathDownloadLocation, bool setExecutable = false) { - string command = $"wget --no-verbose --https-only --timeout=20 --waitretry=1 --tries=9 --retry-connrefused --continue -O {localFilePathDownloadLocation} '{urlToDownload}'"; + var command = $"wget --no-verbose --https-only --timeout=20 --waitretry=1 --tries=9 --retry-connrefused --continue -O {localFilePathDownloadLocation} '{urlToDownload}'"; if (setExecutable) { diff --git a/src/TesApi.Web/ConfigurationUtils.cs b/src/TesApi.Web/ConfigurationUtils.cs index 207182c3b..cdcc2fed7 100644 --- 
a/src/TesApi.Web/ConfigurationUtils.cs +++ b/src/TesApi.Web/ConfigurationUtils.cs @@ -6,13 +6,10 @@ using System.Linq; using System.Threading; using System.Threading.Tasks; -using Microsoft.Extensions.Configuration; using Microsoft.Extensions.Logging; -using Microsoft.Extensions.Options; using Tes.Models; using TesApi.Web.Management; using TesApi.Web.Management.Models.Quotas; -using TesApi.Web.Options; using TesApi.Web.Storage; namespace TesApi.Web @@ -22,7 +19,6 @@ namespace TesApi.Web /// public class ConfigurationUtils { - private readonly string defaultStorageAccountName; private readonly IStorageAccessProvider storageAccessProvider; private readonly ILogger logger; private readonly IBatchQuotaProvider quotaProvider; @@ -32,31 +28,28 @@ public class ConfigurationUtils /// /// The constructor /// - /// Configuration of /// /// > /// > /// /// public ConfigurationUtils( - IOptions defaultStorageOptions, IStorageAccessProvider storageAccessProvider, IBatchQuotaProvider quotaProvider, IBatchSkuInformationProvider skuInformationProvider, BatchAccountResourceInformation batchAccountResourceInformation, ILogger logger) { - ArgumentNullException.ThrowIfNull(storageAccessProvider); ArgumentNullException.ThrowIfNull(quotaProvider); ArgumentNullException.ThrowIfNull(batchAccountResourceInformation); if (string.IsNullOrEmpty(batchAccountResourceInformation.Region)) { throw new ArgumentException( - $"The batch information provided does not include region. Batch information:{batchAccountResourceInformation}"); + $"The batch information provided does not include region. 
Batch information:{batchAccountResourceInformation}", + nameof(batchAccountResourceInformation)); } ArgumentNullException.ThrowIfNull(logger); - this.defaultStorageAccountName = defaultStorageOptions.Value.DefaultAccountName; this.storageAccessProvider = storageAccessProvider; this.logger = logger; this.quotaProvider = quotaProvider; diff --git a/src/TesApi.Web/Scheduler.cs b/src/TesApi.Web/Scheduler.cs index e8ca6a235..c56decb57 100644 --- a/src/TesApi.Web/Scheduler.cs +++ b/src/TesApi.Web/Scheduler.cs @@ -155,7 +155,6 @@ private Task ExecuteUpdateTesTaskFromEventBlobAsync(CancellationToken stoppingTo stoppingToken); } - // TODO: Implement this /// /// Retrieves all event blobs from storage and updates the resultant state. /// @@ -163,38 +162,45 @@ private Task ExecuteUpdateTesTaskFromEventBlobAsync(CancellationToken stoppingTo /// async ValueTask UpdateTesTasksFromEventBlobsAsync(CancellationToken stoppingToken) { + var markEventsProcessedList = new ConcurrentBag>(); Func> getEventsInOrder; { - var messageInfos = new ConcurrentBag(); var messages = new ConcurrentBag<(RunnerEventsMessage Message, AzureBatchTaskState State)>(); // Get and parse event blobs - await foreach (var message in batchScheduler.GetEventMessagesAsync(stoppingToken) - .WithCancellation(stoppingToken)) + await Parallel.ForEachAsync(batchScheduler.GetEventMessagesAsync(stoppingToken), stoppingToken, async (eventMessage, cancellationToken) => { - messageInfos.Add(message); - } - - //try - //{ - await Parallel.ForEachAsync(messageInfos, ProcessMessage); - //} - //catch { } // TODO: identify exceptions + try + { + nodeEventProcessor.ValidateMessageMetadata(eventMessage); + await nodeEventProcessor.DownloadAndValidateMessageContentAsync(eventMessage, cancellationToken); + messages.Add((eventMessage, nodeEventProcessor.GetMessageBatchState(eventMessage))); + } + catch (Exception ex) + { + logger.LogError(ex, @""); + messages.Add((eventMessage, new(AzureBatchTaskState.TaskState.InfoUpdate, Warning: 
new List + { + "EventParsingFailed", + $"{ex.GetType().FullName}: {ex.Message}", + }))); + return; + } - async ValueTask ProcessMessage(RunnerEventsMessage messageInfo, CancellationToken cancellationToken) - { - nodeEventProcessor.ValidateMessageMetadata(messageInfo); - await nodeEventProcessor.DownloadAndValidateMessageContentAsync(messageInfo, cancellationToken); - messages.Add((messageInfo, nodeEventProcessor.GetMessageBatchState(messageInfo))); - await nodeEventProcessor.MarkMessageProcessedAsync(messageInfo, cancellationToken); - } + markEventsProcessedList.Add(token => nodeEventProcessor.MarkMessageProcessedAsync(eventMessage, token)); + }); getEventsInOrder = () => nodeEventProcessor.OrderProcessedByExecutorSequence(messages, item => item.Message); } var orderedMessageList = getEventsInOrder().ToList(); + if (!orderedMessageList.Any()) + { + return; + } + // Update TesTasks await OrchestrateTesTasksOnBatchAsync( "NodeEvent", @@ -205,20 +211,32 @@ await OrchestrateTesTasksOnBatchAsync( stoppingToken, "events"); + await Parallel.ForEachAsync(markEventsProcessedList, stoppingToken, async (markEventProcessed, cancellationToken) => + { + try + { + await markEventProcessed(cancellationToken); + } + catch (Exception ex) + { + logger.LogError(ex, @""); + } + }); + // Helpers async IAsyncEnumerable GetTesTasks([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) { - foreach (var id in orderedMessageList.Select(t => t.Message.TesTaskId)) + foreach (var (id, @event) in orderedMessageList.Select(t => t.Message).Select(m => (m.TesTaskId, m.RunnerEventMessage.Name))) { TesTask tesTask = default; if (await repository.TryGetItemAsync(id, cancellationToken, task => tesTask = task) && tesTask is not null) { - logger.LogDebug("Completing event for task {TesTask}.", tesTask.Id); + logger.LogDebug("Completing event '{TaskEvent}' for task {TesTask}.", @event, tesTask.Id); yield return tesTask; } else { - logger.LogDebug("Could not find 
task {TesTask} for event.", id); + logger.LogDebug("Could not find task {TesTask} for event '{TaskEvent}'.", id, @event); yield return null; } } diff --git a/src/deploy-tes-on-azure/Deployer.cs b/src/deploy-tes-on-azure/Deployer.cs index 95c98d997..73076235b 100644 --- a/src/deploy-tes-on-azure/Deployer.cs +++ b/src/deploy-tes-on-azure/Deployer.cs @@ -83,7 +83,6 @@ public class Deployer public const string ConfigurationContainerName = "configuration"; public const string TesInternalContainerName = "tes-internal"; - public const string ContainersToMountFileName = "containers-to-mount"; public const string AllowedVmSizesFileName = "allowed-vm-sizes"; public const string TesCredentialsFileName = "TesCredentials.json"; public const string InputsContainerName = "inputs"; @@ -109,7 +108,7 @@ private record TesCredentials(string TesHostname, string TesUsername, string Tes "Microsoft.DBforPostgreSQL" }; - private readonly Dictionary> requiredResourceProviderFeatures = new Dictionary>() + private readonly Dictionary> requiredResourceProviderFeatures = new() { { "Microsoft.Compute", new List { "EncryptionAtHost" } } }; @@ -695,7 +694,7 @@ await Execute( if (exc is HttpRequestException rExc) { - ConsoleEx.WriteLine($"HTTP Request StatusCode: {rExc.StatusCode.ToString()}"); + ConsoleEx.WriteLine($"HTTP Request StatusCode: {rExc.StatusCode}"); if (rExc.InnerException is not null) { ConsoleEx.WriteLine($"InnerException: {rExc.InnerException.GetType().FullName}: {rExc.InnerException.Message}"); @@ -1449,39 +1448,9 @@ private async Task CreateDefaultStorageContainersAsync(IStorageAccount storageAc private Task WritePersonalizedFilesToStorageAccountAsync(IStorageAccount storageAccount, string managedIdentityName) => Execute( - $"Writing {ContainersToMountFileName} file to '{ConfigurationContainerName}' storage container...", + $"Writing {AllowedVmSizesFileName} file to '{TesInternalContainerName}' storage container...", async () => { - await 
UploadTextToStorageAccountAsync(storageAccount, ConfigurationContainerName, ContainersToMountFileName, Utility.PersonalizeContent(new Utility.ConfigReplaceTextItem[] - { - new("{DefaultStorageAccountName}", configuration.StorageAccountName), - new("{ManagedIdentityName}", managedIdentityName) - }, "scripts", ContainersToMountFileName)); - - // Configure Cromwell config file for Docker Mysql or PostgreSQL on Azure. - //if (configuration.ProvisionPostgreSqlOnAzure.GetValueOrDefault()) - //{ - // await UploadTextToStorageAccountAsync(storageAccount, ConfigurationContainerName, CromwellConfigurationFileName, Utility.PersonalizeContent(new Utility.ConfigReplaceTextItem[] - // { - // new("{DatabaseUrl}", $"\"jdbc:postgresql://{configuration.PostgreSqlServerName}.postgres.database.azure.com/{configuration.PostgreSqlCromwellDatabaseName}?sslmode=require\""), - // new("{DatabaseUser}", configuration.UsePostgreSqlSingleServer ? $"\"{configuration.PostgreSqlCromwellUserLogin}@{configuration.PostgreSqlServerName}\"": $"\"{configuration.PostgreSqlCromwellUserLogin}\""), - // new("{DatabasePassword}", $"\"{configuration.PostgreSqlCromwellUserPassword}\""), - // new("{DatabaseDriver}", $"\"org.postgresql.Driver\""), - // new("{DatabaseProfile}", "\"slick.jdbc.PostgresProfile$\""), - // }, "scripts", CromwellConfigurationFileName)); - //} - //else - //{ - // await UploadTextToStorageAccountAsync(storageAccount, ConfigurationContainerName, CromwellConfigurationFileName, Utility.PersonalizeContent(new Utility.ConfigReplaceTextItem[] - // { - // new("{DatabaseUrl}", $"\"jdbc:mysql://mysqldb/cromwell_db?useSSL=false&rewriteBatchedStatements=true&allowPublicKeyRetrieval=true\""), - // new("{DatabaseUser}", $"\"cromwell\""), - // new("{DatabasePassword}", $"\"cromwell\""), - // new("{DatabaseDriver}", $"\"com.mysql.cj.jdbc.Driver\""), - // new("{DatabaseProfile}", "\"slick.jdbc.MySQLProfile$\""), - // }, "scripts", CromwellConfigurationFileName)); - //} - await 
UploadTextToStorageAccountAsync(storageAccount, TesInternalContainerName, $"{ConfigurationContainerName}/{AllowedVmSizesFileName}", Utility.GetFileContent("scripts", AllowedVmSizesFileName)); }); @@ -1525,12 +1494,6 @@ await Execute( )); }); - //await Execute( - // $"Creating PostgreSQL cromwell database: {configuration.PostgreSqlCromwellDatabaseName}...", - // () => postgresManagementClient.Databases.CreateAsync( - // configuration.ResourceGroupName, configuration.PostgreSqlServerName, configuration.PostgreSqlCromwellDatabaseName, - // new())); - await Execute( $"Creating PostgreSQL tes database: {configuration.PostgreSqlTesDatabaseName}...", () => postgresManagementClient.Databases.CreateAsync( @@ -1580,12 +1543,6 @@ await postgreSqlDnsZone .ApplyAsync(); }); - //await Execute( - // $"Creating PostgreSQL cromwell database: {configuration.PostgreSqlCromwellDatabaseName}...", - // async () => await postgresManagementClient.Databases.CreateOrUpdateAsync( - // configuration.ResourceGroupName, configuration.PostgreSqlServerName, configuration.PostgreSqlCromwellDatabaseName, - // new())); - await Execute( $"Creating PostgreSQL tes database: {configuration.PostgreSqlTesDatabaseName}...", () => postgresManagementClient.Databases.CreateOrUpdateAsync( @@ -2199,7 +2156,7 @@ private Task UpdateVnetWithBatchSubnet() return (await updatedVnet.GetSubnetAsync(configuration.DefaultBatchSubnetName)).Value.Id.ToString(); }); - private void AddServiceEndpointsToSubnet(SubnetData subnet) + private static void AddServiceEndpointsToSubnet(SubnetData subnet) { subnet.ServiceEndpoints.Add(new ServiceEndpointProperties() { @@ -2222,6 +2179,7 @@ private void AddServiceEndpointsToSubnet(SubnetData subnet) }); } + // TODO: Remove? 
private async Task ValidateVmAsync() { var computeSkus = (await generalRetryPolicy.ExecuteAsync(ct => diff --git a/src/deploy-tes-on-azure/deploy-tes-on-azure.csproj b/src/deploy-tes-on-azure/deploy-tes-on-azure.csproj index 6c662bbee..83d081f27 100644 --- a/src/deploy-tes-on-azure/deploy-tes-on-azure.csproj +++ b/src/deploy-tes-on-azure/deploy-tes-on-azure.csproj @@ -58,7 +58,6 @@ - diff --git a/src/deploy-tes-on-azure/scripts/containers-to-mount b/src/deploy-tes-on-azure/scripts/containers-to-mount deleted file mode 100644 index 967c88fae..000000000 --- a/src/deploy-tes-on-azure/scripts/containers-to-mount +++ /dev/null @@ -1,32 +0,0 @@ -# This file lists blob containers that will be mounted to the Cromwell host, so they are available for WDL commands that access the files from the Cromwell host. - -# Accepted formats are: -# 1. Blob container URL with SAS, for cases where CoA VM does not have access to the storage account (containers from other Azure tenants). -# https://.blob.core.windows.net/? -# 2. //, where the identity '{ManagedIdentityName}' has 'Contributor' role on the storage account. - -# In both cases, the containers are mounted as // on the Cromwell host. -# Similarly, when specifying file paths in inputs.json files, use the same approach to refer to Azure blobs: /// -# You may also use http URIs for input files, but WDL commands that access the files from the Cromwell host will not be able to access them. - -# Wildcards are supported. For example: -# - to include all containers in an account, enter /mystorageaccount/* -# - to include all containers in all accounts, enter * -# Only those accounts that the identity '{ManagedIdentityName}' has 'Contributor' role on will actually be considered. - -# Exclusions are supported and processed after inclusions: -# - to exclude specific account or container, prefix the line with "-", for example -/mystorageaccount/private - -# Comments are allowed only at the beginning of the line. 
-# Changes in this file take effect after VM restart. - -# Required containers, don't delete these lines: - -/{DefaultStorageAccountName}/configuration -/{DefaultStorageAccountName}/cromwell-executions -/{DefaultStorageAccountName}/cromwell-workflow-logs - -# Optional containers, should list all containers where your input files are located: - -/{DefaultStorageAccountName}/inputs -https://datasettestinputs.blob.core.windows.net/dataset?sv=2018-03-28&sr=c&si=coa&sig=nKoK6dxjtk5172JZfDH116N6p3xTs7d%2Bs5EAUE4qqgM%3D \ No newline at end of file From 3a1382a92b4470acf1a4ef3af0582d0d553b3bc0 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 27 Oct 2023 13:06:55 -0700 Subject: [PATCH 058/202] formatting --- src/TesApi.Web/BatchScheduler.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index d8eeb67d5..89bac57ea 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -193,9 +193,9 @@ async Task SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, Comb logger.LogDebug(@"Setting task {TesTask} with metadata {Metadata}.", tesTask.Id, newData); } - var(batchNodeMetrics, taskStartTime, taskEndTime, cromwellRcCode) = newTaskState == TesState.COMPLETEEnum - ? await GetBatchNodeMetricsAndCromwellResultCodeAsync(tesTask, cancellationToken) - : default; + var (batchNodeMetrics, taskStartTime, taskEndTime, cromwellRcCode) = newTaskState == TesState.COMPLETEEnum + ? 
await GetBatchNodeMetricsAndCromwellResultCodeAsync(tesTask, cancellationToken) + : default; lock (setTaskStateLock) { From 62de42e5eb364769f5c18e300621c47966c73c22 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 27 Oct 2023 15:43:04 -0700 Subject: [PATCH 059/202] Fix event bugs --- src/TesApi.Tests/BatchSchedulerTests.cs | 3 -- src/TesApi.Web/BatchScheduler.cs | 35 ++++++++++++++----- .../Events/RunnerEventsProcessor.cs | 22 +++++++++--- 3 files changed, 43 insertions(+), 17 deletions(-) diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index 143f16c45..ee05c6fbf 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -973,9 +973,6 @@ public async Task SuccessfullyCompletedTaskContainsBatchNodeMetrics() Assert.AreEqual(4, batchNodeMetrics.FileUploadSizeInGB); var taskLog = tesTask.GetOrAddTesTaskLog(); - var executorLog = taskLog.GetOrAddExecutorLog(); - Assert.IsNotNull(executorLog); - Assert.AreEqual(0, executorLog.ExitCode); Assert.AreEqual(DateTimeOffset.Parse("2020-10-08T02:30:39+00:00"), taskLog.StartTime); Assert.AreEqual(DateTimeOffset.Parse("2020-10-08T02:49:39+00:00"), taskLog.EndTime); }); diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 89bac57ea..ad6d443bb 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -182,7 +182,13 @@ public BatchScheduler( async Task SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { { - var newData = System.Text.Json.JsonSerializer.Serialize(batchInfo, new System.Text.Json.JsonSerializerOptions { DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingDefault }); + var newData = System.Text.Json.JsonSerializer.Serialize( + batchInfo, + new System.Text.Json.JsonSerializerOptions() + { + DefaultIgnoreCondition = 
System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingDefault, + Converters = { new System.Text.Json.Serialization.JsonStringEnumConverter(System.Text.Json.JsonNamingPolicy.CamelCase) } + }); if ("{}".Equals(newData) && newTaskState == tesTask.State) { @@ -195,7 +201,7 @@ async Task SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, Comb var (batchNodeMetrics, taskStartTime, taskEndTime, cromwellRcCode) = newTaskState == TesState.COMPLETEEnum ? await GetBatchNodeMetricsAndCromwellResultCodeAsync(tesTask, cancellationToken) - : default; + : default; lock (setTaskStateLock) { @@ -204,14 +210,25 @@ async Task SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, Comb var tesTaskLog = tesTask.GetOrAddTesTaskLog(); var tesTaskExecutorLog = tesTaskLog.GetOrAddExecutorLog(); + if (batchInfo.OutputFileLogs is not null && tesTaskLog.Outputs is not null) + { + logger.LogCritical("Why is tesTaskLog.Outputs already set?"); + } + tesTaskLog.BatchNodeMetrics = batchNodeMetrics; tesTaskLog.CromwellResultCode = cromwellRcCode; tesTaskLog.EndTime ??= taskEndTime ?? batchInfo.BatchTaskEndTime; tesTaskLog.StartTime ??= taskStartTime ?? 
batchInfo.BatchTaskStartTime; - tesTaskLog.Outputs ??= batchInfo.OutputFileLogs?.Select(entry => new Tes.Models.TesOutputFileLog { Path = entry.Path, SizeBytes = $"{entry.Size}", Url = entry.Url.AbsoluteUri }).ToList(); - tesTaskExecutorLog.StartTime ??= batchInfo.ExecutorEndTime; + tesTaskLog.Outputs ??= batchInfo.OutputFileLogs?.Select( + entry => new Tes.Models.TesOutputFileLog + { + Path = entry.Path, + SizeBytes = $"{entry.Size}", + Url = entry.Url.AbsoluteUri + }).ToList(); + tesTaskExecutorLog.StartTime ??= batchInfo.ExecutorStartTime; tesTaskExecutorLog.EndTime ??= batchInfo.ExecutorEndTime; - tesTaskExecutorLog.ExitCode ??= batchInfo.BatchTaskExitCode; + tesTaskExecutorLog.ExitCode ??= batchInfo.ExecutorExitCode; // Only accurate when the task completes successfully, otherwise it's the Batch time as reported from Batch // TODO this could get large; why? @@ -311,7 +328,7 @@ Task HandleInfoUpdate(TesTask tesTask, CombinedBatchTaskInfo batchInfo, Ca //new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.MissingBatchTask, alternateSystemLogItem: null, (tesTask, _, ct) => AddBatchTaskAsync(tesTask, ct)), new TesTaskStateTransition(tesTaskIsQueued, AzureBatchTaskState.TaskState.Initializing, alternateSystemLogItem: null, (tesTask, _) => { tesTask.State = TesState.INITIALIZINGEnum; return true; }), new TesTaskStateTransition(tesTaskIsQueuedOrInitializing, AzureBatchTaskState.TaskState.NodeAllocationFailed, alternateSystemLogItem: null, RequeueTaskAfterFailureAsync), - new TesTaskStateTransition(tesTaskIsQueuedOrInitializing, AzureBatchTaskState.TaskState.Running, alternateSystemLogItem: null, (tesTask, _) => { tesTask.State = TesState.RUNNINGEnum; return true; }), + new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.Running, alternateSystemLogItem: null, (tesTask, info, ct) => SetTaskStateAndLog(tesTask, TesState.RUNNINGEnum, info, ct)), //new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, 
AzureBatchTaskState.TaskState.MoreThanOneActiveJobOrTaskFound, BatchTaskState.MoreThanOneActiveJobOrTaskFound.ToString(), DeleteBatchJobAndSetTaskSystemErrorAsync), new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.CompletedSuccessfully, alternateSystemLogItem: null, SetTaskCompleted), new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.CompletedWithErrors, "Please open an issue. There should have been an error reported here.", SetTaskExecutorError), @@ -321,7 +338,8 @@ Task HandleInfoUpdate(TesTask tesTask, CombinedBatchTaskInfo batchInfo, Ca //new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.JobNotFound, BatchTaskState.JobNotFound.ToString(), SetTaskSystemError), //new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.MissingBatchTask, BatchTaskState.MissingBatchTask.ToString(), DeleteBatchJobAndSetTaskSystemErrorAsync), new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.NodePreempted, alternateSystemLogItem: null, HandlePreemptedNodeAsync), - new TesTaskStateTransition(condition: null, AzureBatchTaskState.TaskState.InfoUpdate, alternateSystemLogItem: null, HandleInfoUpdate) + new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.NodeFilesUploadOrDownloadFailed, alternateSystemLogItem: null, HandleInfoUpdate), + new TesTaskStateTransition(condition: null, AzureBatchTaskState.TaskState.InfoUpdate, alternateSystemLogItem: null, HandleInfoUpdate), }.AsReadOnly(); } @@ -1456,8 +1474,7 @@ public async ValueTask ActionAsync(TesTask tesTask, AzureBatchTaskState ba if (Action is not null) { - Action(tesTask, combinedBatchTaskInfo); - tesTaskChanged = true; + tesTaskChanged = Action(tesTask, combinedBatchTaskInfo); } return tesTaskChanged; diff --git a/src/TesApi.Web/Events/RunnerEventsProcessor.cs 
b/src/TesApi.Web/Events/RunnerEventsProcessor.cs index 37c33da88..a0458ca5e 100644 --- a/src/TesApi.Web/Events/RunnerEventsProcessor.cs +++ b/src/TesApi.Web/Events/RunnerEventsProcessor.cs @@ -183,7 +183,7 @@ public AzureBatchTaskState GetMessageBatchState(RunnerEventsMessage message) _logger.LogDebug("Getting batch task state from event {EventName} for {TesTask}.", nodeMessage.Name ?? message.Event, nodeMessage.EntityId); return (nodeMessage.Name ?? message.Event) switch { - Tes.Runner.Events.EventsPublisher.DownloadStartEvent => new(AzureBatchTaskState.TaskState.NoChange, + Tes.Runner.Events.EventsPublisher.DownloadStartEvent => new(AzureBatchTaskState.TaskState.InfoUpdate, BatchTaskStartTime: nodeMessage.Created), Tes.Runner.Events.EventsPublisher.DownloadEndEvent => nodeMessage.StatusMessage switch @@ -271,10 +271,22 @@ public AzureBatchTaskState GetMessageBatchState(RunnerEventsMessage message) var numberOfFiles = int.Parse(eventData["numberOfFiles"]); for (var i = 0; i < numberOfFiles; ++i) { - yield return new( - new Uri(eventData[$"fileUri-{i}"]), - eventData[$"filePath-{i}"], - long.Parse(eventData[$"fileSize-{i}"])); + var nodePath = eventData[$"filePath-{i}"]; + var idxStart = nodePath.IndexOf("/wd/"); + + if (idxStart > 0) + { + var containerPathUnderRoot = nodePath[(idxStart + 1)..]; + var idxDirectory = containerPathUnderRoot.IndexOf('/'); + + if (idxDirectory > 0) + { + yield return new( + new Azure.Storage.Blobs.BlobUriBuilder(new Uri(eventData[$"fileUri-{i}"])) { Sas = null, Query = null }.ToUri(), + $"/{containerPathUnderRoot}", + long.Parse(eventData[$"fileSize-{i}"])); + } + } } } } From ba910913f2547e83b7f249b357c407749eb86ea3 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 27 Oct 2023 18:36:00 -0700 Subject: [PATCH 060/202] fix TesOutputFileLog --- src/TesApi.Web/BatchScheduler.cs | 17 ++++++++--------- src/TesApi.Web/Events/RunnerEventsProcessor.cs | 6 ++++-- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git 
a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index ad6d443bb..d991d5c7c 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -210,22 +210,21 @@ async Task SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, Comb var tesTaskLog = tesTask.GetOrAddTesTaskLog(); var tesTaskExecutorLog = tesTaskLog.GetOrAddExecutorLog(); - if (batchInfo.OutputFileLogs is not null && tesTaskLog.Outputs is not null) + if (tesTaskLog.Outputs is not null && !(batchInfo.OutputFileLogs?.Any() ?? true)) { - logger.LogCritical("Why is tesTaskLog.Outputs already set?"); + tesTaskLog.Outputs = batchInfo.OutputFileLogs?.Select( + entry => new Tes.Models.TesOutputFileLog + { + Path = entry.Path, + SizeBytes = $"{entry.Size}", + Url = entry.Url.AbsoluteUri + }).ToList(); } tesTaskLog.BatchNodeMetrics = batchNodeMetrics; tesTaskLog.CromwellResultCode = cromwellRcCode; tesTaskLog.EndTime ??= taskEndTime ?? batchInfo.BatchTaskEndTime; tesTaskLog.StartTime ??= taskStartTime ?? 
batchInfo.BatchTaskStartTime; - tesTaskLog.Outputs ??= batchInfo.OutputFileLogs?.Select( - entry => new Tes.Models.TesOutputFileLog - { - Path = entry.Path, - SizeBytes = $"{entry.Size}", - Url = entry.Url.AbsoluteUri - }).ToList(); tesTaskExecutorLog.StartTime ??= batchInfo.ExecutorStartTime; tesTaskExecutorLog.EndTime ??= batchInfo.ExecutorEndTime; tesTaskExecutorLog.ExitCode ??= batchInfo.ExecutorExitCode; diff --git a/src/TesApi.Web/Events/RunnerEventsProcessor.cs b/src/TesApi.Web/Events/RunnerEventsProcessor.cs index a0458ca5e..358852522 100644 --- a/src/TesApi.Web/Events/RunnerEventsProcessor.cs +++ b/src/TesApi.Web/Events/RunnerEventsProcessor.cs @@ -263,6 +263,8 @@ public AzureBatchTaskState GetMessageBatchState(RunnerEventsMessage message) static IEnumerable GetFileLogs(IDictionary eventData) { + const string marker = "/wd/"; + if (eventData is null) { yield break; @@ -272,11 +274,11 @@ public AzureBatchTaskState GetMessageBatchState(RunnerEventsMessage message) for (var i = 0; i < numberOfFiles; ++i) { var nodePath = eventData[$"filePath-{i}"]; - var idxStart = nodePath.IndexOf("/wd/"); + var idxStart = nodePath.IndexOf(marker); if (idxStart > 0) { - var containerPathUnderRoot = nodePath[(idxStart + 1)..]; + var containerPathUnderRoot = nodePath[(idxStart + marker.Length)..]; var idxDirectory = containerPathUnderRoot.IndexOf('/'); if (idxDirectory > 0) From 22b2d8cadd779718ac65e1e13f3db75cb7b35272 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 27 Oct 2023 18:36:48 -0700 Subject: [PATCH 061/202] Gather information to better detect tasks affected by node failures --- src/TesApi.Web/BatchPool.cs | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index 8857090b9..5d033aa78 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -34,6 +34,7 @@ public sealed partial class BatchPool private readonly ILogger _logger; private 
readonly IAzureProxy _azureProxy; + private readonly Storage.IStorageAccessProvider _storageAccessProvider; /// /// Constructor of . @@ -42,9 +43,11 @@ public sealed partial class BatchPool /// /// /// + /// /// - public BatchPool(IBatchScheduler batchScheduler, IOptions batchSchedulingOptions, IAzureProxy azureProxy, ILogger logger) + public BatchPool(IBatchScheduler batchScheduler, IOptions batchSchedulingOptions, IAzureProxy azureProxy, ILogger logger, Storage.IStorageAccessProvider storageAccessProvider) { + _storageAccessProvider = storageAccessProvider; var rotationDays = batchSchedulingOptions.Value.PoolRotationForcedDays; if (rotationDays == 0) { rotationDays = Options.BatchSchedulingOptions.DefaultPoolRotationForcedDays; } _forcePoolRotationAge = TimeSpan.FromDays(rotationDays); @@ -342,6 +345,23 @@ private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken canc { var nodesToRemove = Enumerable.Empty(); + async Task SendNodeErrorData(string nodeId, IReadOnlyList content) + { + var url = await _storageAccessProvider.GetInternalTesBlobUrlAsync( + $"nodeError/{nodeId}-{new Guid():B}", + Azure.Storage.Sas.BlobSasPermissions.Create, + cancellationToken); + await _azureProxy.UploadBlobAsync( + new(url), + System.Text.Json.JsonSerializer.Serialize(content, + new System.Text.Json.JsonSerializerOptions() + { + DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingDefault, + Converters = { new System.Text.Json.Serialization.JsonStringEnumConverter(System.Text.Json.JsonNamingPolicy.CamelCase) } + }), + cancellationToken); + } + // It's documented that a max of 100 nodes can be removed at a time. Excess eligible nodes will be removed in a future call to this method. 
await foreach (var node in GetNodesToRemove(true).Take(MaxComputeNodesToRemoveAtOnce).WithCancellation(cancellationToken)) { @@ -349,6 +369,8 @@ private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken canc { case ComputeNodeState.Unusable: _logger.LogDebug("Found unusable node {NodeId}", node.Id); + await SendNodeErrorData(node.Id, node.RecentTasks); + //node.RecentTasks[0].ExecutionInformation.FailureInformation.Code == TaskFailureInformationCodes.DiskFull // TODO: notify running tasks that task will switch nodes? break; @@ -359,6 +381,9 @@ private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken canc case ComputeNodeState.Preempted: _logger.LogDebug("Found preempted node {NodeId}", node.Id); + await SendNodeErrorData(node.Id, node.RecentTasks); + //node.RecentTasks[0].TaskId + //node.RecentTasks[0].ExecutionInformation.FailureInformation.Category == ErrorCategory.ServerError // TODO: notify running tasks that task will switch nodes? Or, in the future, terminate the task? break; @@ -420,7 +445,7 @@ async ValueTask GetTaskCountAsync(int @default) // Used to make reenabling } IAsyncEnumerable GetNodesToRemove(bool withState) - => _azureProxy.ListComputeNodesAsync(Id, new ODATADetailLevel(filterClause: @"state eq 'starttaskfailed' or state eq 'preempted' or state eq 'unusable'", selectClause: withState ? @"id,state,startTaskInfo" : @"id")); + => _azureProxy.ListComputeNodesAsync(Id, new ODATADetailLevel(filterClause: @"state eq 'starttaskfailed' or state eq 'preempted' or state eq 'unusable'", selectClause: withState ? @"id,recentTasks,state,startTaskInfo" : @"id")); } private bool DetermineIsAvailable(DateTime? 
creation) From df5b4378cc3c7094ee3c5a97487ef41c9bf75911 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 27 Oct 2023 19:52:52 -0700 Subject: [PATCH 062/202] YA attempt at output file log --- src/TesApi.Web/BatchScheduler.cs | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index d991d5c7c..f07263854 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -210,17 +210,6 @@ async Task SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, Comb var tesTaskLog = tesTask.GetOrAddTesTaskLog(); var tesTaskExecutorLog = tesTaskLog.GetOrAddExecutorLog(); - if (tesTaskLog.Outputs is not null && !(batchInfo.OutputFileLogs?.Any() ?? true)) - { - tesTaskLog.Outputs = batchInfo.OutputFileLogs?.Select( - entry => new Tes.Models.TesOutputFileLog - { - Path = entry.Path, - SizeBytes = $"{entry.Size}", - Url = entry.Url.AbsoluteUri - }).ToList(); - } - tesTaskLog.BatchNodeMetrics = batchNodeMetrics; tesTaskLog.CromwellResultCode = cromwellRcCode; tesTaskLog.EndTime ??= taskEndTime ?? batchInfo.BatchTaskEndTime; @@ -229,6 +218,15 @@ async Task SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, Comb tesTaskExecutorLog.EndTime ??= batchInfo.ExecutorEndTime; tesTaskExecutorLog.ExitCode ??= batchInfo.ExecutorExitCode; + if (tesTaskLog.Outputs is null) + { + tesTaskLog.Outputs = batchInfo.OutputFileLogs?.Select(ConvertOutputFileLogToTesOutputFileLog).ToList(); + } + else if (!tesTaskLog.Outputs.Any()) + { + tesTaskLog.Outputs.AddRange(batchInfo.OutputFileLogs?.Select(ConvertOutputFileLogToTesOutputFileLog) ?? Enumerable.Empty()); + } + // Only accurate when the task completes successfully, otherwise it's the Batch time as reported from Batch // TODO this could get large; why? 
//var timefromCoAScriptCompletionToBatchTaskDetectedComplete = tesTaskLog.EndTime - tesTaskExecutorLog.EndTime; @@ -271,6 +269,16 @@ async Task SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, Comb } return true; + + Tes.Models.TesOutputFileLog ConvertOutputFileLogToTesOutputFileLog(AzureBatchTaskState.OutputFileLog fileLog) + { + return new Tes.Models.TesOutputFileLog + { + Path = fileLog.Path, + SizeBytes = $"{fileLog.Size}", + Url = fileLog.Url.AbsoluteUri + }; + } } async Task SetTaskCompleted(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) From d61138321df93260a842905013310d4339f93ea4 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 30 Oct 2023 16:45:20 -0700 Subject: [PATCH 063/202] Add URL to failure logs and code cleanup --- src/TesApi.Web/AzureBatchTaskState.cs | 51 ++++--- src/TesApi.Web/BatchScheduler.cs | 6 +- src/TesApi.Web/Events/RunnerEventsMessage.cs | 4 +- .../Events/RunnerEventsProcessor.cs | 143 +++++++++++++----- src/TesApi.Web/Scheduler.cs | 61 +++++--- 5 files changed, 187 insertions(+), 78 deletions(-) diff --git a/src/TesApi.Web/AzureBatchTaskState.cs b/src/TesApi.Web/AzureBatchTaskState.cs index 459e039e8..b7be1a3d5 100644 --- a/src/TesApi.Web/AzureBatchTaskState.cs +++ b/src/TesApi.Web/AzureBatchTaskState.cs @@ -4,14 +4,15 @@ using System; using System.Collections.Generic; using System.Linq; +using Tes.Models; using static TesApi.Web.AzureBatchTaskState; namespace TesApi.Web { /// - /// Combined state of an attempt to run a + /// Combined state of the progression of an attempt to run a /// - /// Task state. See . + /// Task state. Also see . /// File details after the task has completed successfully, for logging purposes. /// Failure information. /// . @@ -22,10 +23,10 @@ namespace TesApi.Web /// . /// . /// Warning. First item in enumeration is the Warning code, rest of items are additional system log entries. 
- public record AzureBatchTaskState( + public record class AzureBatchTaskState( TaskState State, IEnumerable OutputFileLogs = default, - FailureInformation Failure = default, + FailureInformation? Failure = default, DateTimeOffset? CloudTaskCreationTime = default, DateTimeOffset? BatchTaskStartTime = default, DateTimeOffset? ExecutorStartTime = default, @@ -41,12 +42,12 @@ public record AzureBatchTaskState( public enum TaskState { /// - /// The event does not represent any change in task state. + /// The event does not represent any change in the task's state. /// NoChange, /// - /// The event provides metadata without changing the task's state. + /// The event provides task state without changing the task's . /// InfoUpdate, @@ -112,28 +113,44 @@ public enum TaskState /// URL of the file in storage, e.g. s3://bucket/file.txt /// Path of the file inside the container. Must be an absolute path. /// Size of the file in bytes. - public record OutputFileLog(Uri Url, string Path, long Size); + public record struct OutputFileLog(Uri Url, string Path, long Size); /// /// TesTask's failure information /// - /// Failure code. Intended to be machine readable. See . - /// Failure details to be added to . - public record FailureInformation(string Reason, IEnumerable SystemLogs); + /// Failure code. Intended to be machine readable. See . + /// Failure details to be added to . + public record struct FailureInformation(string Reason, IEnumerable SystemLogs) + { + + /// + /// Failure details to be added to . + /// + public IEnumerable SystemLogs { get; private set; } = SystemLogs; + + /// + /// Adds additional logs to . + /// + /// Additional logs to add. 
+ public void AppendRangeToSystemLogs(IEnumerable additionalLogs) + { + SystemLogs = SystemLogs.Concat(additionalLogs); + } + } /// - /// SystemLog appending constructor + /// SystemLog-appending copy constructor /// - /// + /// /// - protected AzureBatchTaskState(AzureBatchTaskState other, string appendToSystemLog) - : this(other) + protected AzureBatchTaskState(AzureBatchTaskState original, string appendToSystemLog) + : this(original) { - Failure = other.Failure switch + Failure = original.Failure switch { null => new("UnknownError", Enumerable.Empty().Append(appendToSystemLog)), - { SystemLogs: null } => new(other.Failure.Reason ?? "UnknownError", Enumerable.Empty().Append(appendToSystemLog)), - _ => new(other.Failure.Reason ?? "UnknownError", other.Failure.SystemLogs.Append(appendToSystemLog)), + { SystemLogs: null } => new(original.Failure?.Reason ?? "UnknownError", Enumerable.Empty().Append(appendToSystemLog)), + _ => new(original.Failure?.Reason ?? "UnknownError", original.Failure?.SystemLogs.Append(appendToSystemLog)), }; } } diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index f07263854..0b72d9724 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -249,11 +249,11 @@ async Task SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, Comb if (batchInfo.Failure is not null) { - tesTask.SetFailureReason(batchInfo.Failure.Reason); + tesTask.SetFailureReason(batchInfo.Failure?.Reason); - if (batchInfo.Failure.SystemLogs is not null) + if (batchInfo.Failure?.SystemLogs is not null) { - tesTask.AddToSystemLog(batchInfo.Failure.SystemLogs); + tesTask.AddToSystemLog(batchInfo.Failure?.SystemLogs); } else if (!string.IsNullOrWhiteSpace(batchInfo.AlternateSystemLogItem)) { diff --git a/src/TesApi.Web/Events/RunnerEventsMessage.cs b/src/TesApi.Web/Events/RunnerEventsMessage.cs index 356260d32..a3062c06d 100644 --- a/src/TesApi.Web/Events/RunnerEventsMessage.cs +++ 
b/src/TesApi.Web/Events/RunnerEventsMessage.cs @@ -12,7 +12,7 @@ namespace TesApi.Web.Events /// URL of the event message. /// Tags on the event message blob. /// Name of the event based on parsing the blob's BlobName. - public record class RunnerEventsMessage(Uri BlobUri, IDictionary Tags, string Event) + public record struct RunnerEventsMessage(Uri BlobUri, IDictionary Tags, string Event) { /// /// @@ -22,7 +22,7 @@ public record class RunnerEventsMessage(Uri BlobUri, IDictionary /// /// /// - public string TesTaskId => RunnerEventMessage?.EntityId; + public readonly string TesTaskId => RunnerEventMessage?.EntityId; /// /// Sets . diff --git a/src/TesApi.Web/Events/RunnerEventsProcessor.cs b/src/TesApi.Web/Events/RunnerEventsProcessor.cs index 358852522..6f9dba9ee 100644 --- a/src/TesApi.Web/Events/RunnerEventsProcessor.cs +++ b/src/TesApi.Web/Events/RunnerEventsProcessor.cs @@ -20,20 +20,24 @@ public class RunnerEventsProcessor /// public const string ProcessedTag = "processed"; - private readonly IAzureProxy _azureProxy; - private readonly ILogger _logger; + private readonly IAzureProxy azureProxy; + private readonly Storage.IStorageAccessProvider storageAccessProvider; + private readonly ILogger logger; /// /// Constructor of . 
/// /// + /// /// - public RunnerEventsProcessor(IAzureProxy azureProxy, ILogger logger) + public RunnerEventsProcessor(IAzureProxy azureProxy, Storage.IStorageAccessProvider storageAccessProvider, ILogger logger) { ArgumentNullException.ThrowIfNull(azureProxy); + ArgumentNullException.ThrowIfNull(storageAccessProvider); - _azureProxy = azureProxy; - _logger = logger; + this.azureProxy = azureProxy; + this.storageAccessProvider = storageAccessProvider; + this.logger = logger; } @@ -71,6 +75,11 @@ public void ValidateMessageMetadata(RunnerEventsMessage message) throw new ArgumentException("This message was already processed.", nameof(message)); } + if (!message.Tags.ContainsKey("event-name") || !message.Tags.ContainsKey("task-id")) + { + throw new ArgumentException("This message is missing needed tags.", nameof(message)); + } + // There are up to 10 tags allowed. We will be adding one. // https://learn.microsoft.com/azure/storage/blobs/storage-manage-find-blobs?tabs=azure-portal#setting-blob-index-tags if (message.Tags.Count > 9) @@ -91,7 +100,7 @@ public async Task DownloadAndValidateMessageContentAsync(RunnerEventsMessage mes try { - var messageText = await _azureProxy.DownloadBlobAsync(message.BlobUri, cancellationToken); + var messageText = await azureProxy.DownloadBlobAsync(message.BlobUri, cancellationToken); result = System.Text.Json.JsonSerializer.Deserialize(messageText) ?? throw new InvalidOperationException("Deserialize() returned null."); } @@ -100,61 +109,87 @@ public async Task DownloadAndValidateMessageContentAsync(RunnerEventsMessage mes throw new InvalidOperationException($"Event message blob is malformed. 
{ex.GetType().FullName}:{ex.Message}", ex); } - System.Diagnostics.Debug.Assert(Guid.TryParse(result.Id, out _)); - System.Diagnostics.Debug.Assert(Tes.Runner.Events.EventsPublisher.EventVersion.Equals(result.EventVersion, StringComparison.Ordinal)); - System.Diagnostics.Debug.Assert(Tes.Runner.Events.EventsPublisher.EventDataVersion.Equals(result.EventDataVersion, StringComparison.Ordinal)); - System.Diagnostics.Debug.Assert(Tes.Runner.Events.EventsPublisher.TesTaskRunnerEntityType.Equals(result.EntityType, StringComparison.Ordinal)); - System.Diagnostics.Debug.Assert(message.Event.Equals(result.Name, StringComparison.Ordinal)); + Assert(Guid.TryParse(result.Id, out _), + $"{nameof(result.Id)}('{result.Id}') is malformed."); + Assert(Tes.Runner.Events.EventsPublisher.EventVersion.Equals(result.EventVersion, StringComparison.Ordinal), + $"{nameof(result.EventVersion)}('{result.EventVersion}') is not recognized."); + Assert(Tes.Runner.Events.EventsPublisher.EventDataVersion.Equals(result.EventDataVersion, StringComparison.Ordinal), + $"{nameof(result.EventDataVersion)}('{result.EventDataVersion}') is not recognized."); + Assert(Tes.Runner.Events.EventsPublisher.TesTaskRunnerEntityType.Equals(result.EntityType, StringComparison.Ordinal), + $"{nameof(result.EntityType)}('{result.EntityType}') is not recognized."); + + Assert(message.TesTaskId.Equals(result.EntityId, StringComparison.Ordinal), + $"{nameof(result.EntityId)}('{result.EntityId}') does not match the expected value of '{message.TesTaskId}'."); + Assert(result.EntityId.Equals(message.Tags["task-id"], StringComparison.Ordinal), + $"{nameof(result.Name)}('{result.EntityId}') does not match the expected value of '{message.Tags["task-id"]}' from the tags.."); + Assert(message.Event.Equals(result.Name, StringComparison.OrdinalIgnoreCase), + $"{nameof(result.Name)}('{result.Name}') does not match the expected value of '{message.Event}' from the blob path."); + Assert(result.Name.Equals(message.Tags["event-name"], 
StringComparison.Ordinal), + $"{nameof(result.Name)}('{result.Name}') does not match the expected value of '{message.Tags["event-name"]}' from the tags."); // Event type specific validations switch (result.Name) { case Tes.Runner.Events.EventsPublisher.DownloadStartEvent: - System.Diagnostics.Debug.Assert(Tes.Runner.Events.EventsPublisher.StartedStatus.Equals(result.StatusMessage, StringComparison.Ordinal)); + Assert(Tes.Runner.Events.EventsPublisher.StartedStatus.Equals(result.StatusMessage, StringComparison.Ordinal), + $"{nameof(result.StatusMessage)}('{result.StatusMessage}') does not match the expected value of '{Tes.Runner.Events.EventsPublisher.StartedStatus}'."); break; case Tes.Runner.Events.EventsPublisher.DownloadEndEvent: - System.Diagnostics.Debug.Assert(new[] { Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(result.StatusMessage)); + Assert(new[] { Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(result.StatusMessage), + $"{nameof(result.StatusMessage)}('{result.StatusMessage}') does not match one of the expected valued of '{Tes.Runner.Events.EventsPublisher.SuccessStatus}' or '{Tes.Runner.Events.EventsPublisher.FailedStatus}'."); break; case Tes.Runner.Events.EventsPublisher.UploadStartEvent: - System.Diagnostics.Debug.Assert(Tes.Runner.Events.EventsPublisher.StartedStatus.Equals(result.StatusMessage, StringComparison.Ordinal)); + Assert(Tes.Runner.Events.EventsPublisher.StartedStatus.Equals(result.StatusMessage, StringComparison.Ordinal), + $"{nameof(result.StatusMessage)}('{result.StatusMessage}') does not match the expected value of '{Tes.Runner.Events.EventsPublisher.StartedStatus}'."); break; case Tes.Runner.Events.EventsPublisher.UploadEndEvent: - System.Diagnostics.Debug.Assert(new[] { Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(result.StatusMessage)); + Assert(new[] 
{ Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(result.StatusMessage), + $"{nameof(result.StatusMessage)}('{result.StatusMessage}') does not match one of the expected valued of '{Tes.Runner.Events.EventsPublisher.SuccessStatus}' or '{Tes.Runner.Events.EventsPublisher.FailedStatus}'."); break; case Tes.Runner.Events.EventsPublisher.ExecutorStartEvent: - System.Diagnostics.Debug.Assert(Tes.Runner.Events.EventsPublisher.StartedStatus.Equals(result.StatusMessage, StringComparison.Ordinal)); + Assert(Tes.Runner.Events.EventsPublisher.StartedStatus.Equals(result.StatusMessage, StringComparison.Ordinal), + $"{nameof(result.StatusMessage)}('{result.StatusMessage}') does not match the expected value of '{Tes.Runner.Events.EventsPublisher.StartedStatus}'."); break; case Tes.Runner.Events.EventsPublisher.ExecutorEndEvent: - System.Diagnostics.Debug.Assert(new[] { Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(result.StatusMessage)); + Assert(new[] { Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(result.StatusMessage), + $"{nameof(result.StatusMessage)}('{result.StatusMessage}') does not match one of the expected valued of '{Tes.Runner.Events.EventsPublisher.SuccessStatus}' or '{Tes.Runner.Events.EventsPublisher.FailedStatus}'."); break; case Tes.Runner.Events.EventsPublisher.TaskCompletionEvent: - System.Diagnostics.Debug.Assert(new[] { Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(result.StatusMessage)); + Assert(new[] { Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(result.StatusMessage), + $"{nameof(result.StatusMessage)}('{result.StatusMessage}') does not match one of the expected valued of '{Tes.Runner.Events.EventsPublisher.SuccessStatus}' or 
'{Tes.Runner.Events.EventsPublisher.FailedStatus}'."); break; default: - System.Diagnostics.Debug.Assert(false); + Assert(false, $"{nameof(result.Name)}('{result.Name}') is not recognized."); break; } message.SetRunnerEventMessage(result); + + static void Assert([System.Diagnostics.CodeAnalysis.DoesNotReturnIf(false)] bool condition, string message) + { + if (!condition) + { + throw new InvalidOperationException(message); + } + } } - private enum EventsInOrder + private readonly IReadOnlyDictionary EventsInOrder = new Dictionary(StringComparer.OrdinalIgnoreCase) { - downloadStart, - downloadEnd, - executorStart, - executorEnd, - uploadStart, - uploadEnd, - taskCompleted, - } + { Tes.Runner.Events.EventsPublisher.DownloadStartEvent, 20 }, + { Tes.Runner.Events.EventsPublisher.DownloadEndEvent, 30 }, + { Tes.Runner.Events.EventsPublisher.ExecutorStartEvent, 40 }, + { Tes.Runner.Events.EventsPublisher.ExecutorEndEvent, 50 }, + { Tes.Runner.Events.EventsPublisher.UploadStartEvent, 60 }, + { Tes.Runner.Events.EventsPublisher.UploadEndEvent, 70 }, + }.AsReadOnly(); /// /// Returns a sequence in the order the events were produced. @@ -165,25 +200,27 @@ private enum EventsInOrder /// public IEnumerable OrderProcessedByExecutorSequence(IEnumerable source, Func messageGetter) { - return source.OrderBy(t => messageGetter(t).RunnerEventMessage.Created).ThenBy(t => Enum.TryParse(typeof(EventsInOrder), messageGetter(t).RunnerEventMessage.Name, true, out var result) ? result : -1); + return source.OrderBy(t => messageGetter(t).RunnerEventMessage.Created).ThenBy(t => EventsInOrder.TryGetValue(messageGetter(t).RunnerEventMessage.Name, out var result) ? result : -1); } /// /// Gets the task status details from this event message. 
/// /// + /// + /// /// - public AzureBatchTaskState GetMessageBatchState(RunnerEventsMessage message) + public async Task GetMessageBatchStateAsync(RunnerEventsMessage message, Tes.Models.TesTask tesTask, CancellationToken cancellationToken) { ArgumentNullException.ThrowIfNull(message); ArgumentNullException.ThrowIfNull(message.RunnerEventMessage, nameof(message)); var nodeMessage = message.RunnerEventMessage; + logger.LogDebug("Getting batch task state from event {EventName} for {TesTask}.", nodeMessage.Name ?? message.Event, nodeMessage.EntityId); - _logger.LogDebug("Getting batch task state from event {EventName} for {TesTask}.", nodeMessage.Name ?? message.Event, nodeMessage.EntityId); - return (nodeMessage.Name ?? message.Event) switch + var state = (nodeMessage.Name ?? message.Event) switch { - Tes.Runner.Events.EventsPublisher.DownloadStartEvent => new(AzureBatchTaskState.TaskState.InfoUpdate, + Tes.Runner.Events.EventsPublisher.DownloadStartEvent => new AzureBatchTaskState(AzureBatchTaskState.TaskState.InfoUpdate, BatchTaskStartTime: nodeMessage.Created), Tes.Runner.Events.EventsPublisher.DownloadEndEvent => nodeMessage.StatusMessage switch @@ -261,6 +298,17 @@ public AzureBatchTaskState GetMessageBatchState(RunnerEventsMessage message) _ => throw new System.Diagnostics.UnreachableException(), }; + var processLogs = await GetProcessLogs(nodeMessage, tesTask, cancellationToken).ToListAsync(cancellationToken); + + if (processLogs.Any()) + { + processLogs.Insert(0, "Possibly relevant logs:"); + state.Failure?.AppendRangeToSystemLogs(processLogs); + } + + return state; + + // Helpers static IEnumerable GetFileLogs(IDictionary eventData) { const string marker = "/wd/"; @@ -291,6 +339,33 @@ public AzureBatchTaskState GetMessageBatchState(RunnerEventsMessage message) } } } + + async IAsyncEnumerable GetProcessLogs(Tes.Runner.Events.EventMessage message, Tes.Models.TesTask tesTask, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken 
cancellationToken) + { + var blobNameStartsWith = message.Name switch + { + Tes.Runner.Events.EventsPublisher.DownloadEndEvent => "download_std", + Tes.Runner.Events.EventsPublisher.ExecutorEndEvent => "exec_std", + Tes.Runner.Events.EventsPublisher.UploadEndEvent => "upload_std", + _ => string.Empty, + }; + + if (string.IsNullOrEmpty(blobNameStartsWith)) + { + yield break; + } + + var listUri = await storageAccessProvider.GetInternalTesTaskBlobUrlAsync(tesTask, string.Empty, Azure.Storage.Sas.BlobSasPermissions.List, cancellationToken); + + await foreach(var uri in azureProxy.ListBlobsAsync(new(listUri), cancellationToken) + .Where(blob => blob.BlobName.EndsWith(".txt") && System.IO.Path.GetFileName(blob.BlobName).StartsWith(blobNameStartsWith)) + .OrderBy(blob => blob.BlobName) + .Select(blob => blob.BlobUri) + .WithCancellation(cancellationToken)) + { + yield return uri.AbsoluteUri; + } + } } /// @@ -301,7 +376,7 @@ public AzureBatchTaskState GetMessageBatchState(RunnerEventsMessage message) /// public async Task MarkMessageProcessedAsync(RunnerEventsMessage message, CancellationToken cancellationToken) { - await _azureProxy.SetBlobTags( + await azureProxy.SetBlobTags( message.BlobUri, message.Tags .Append(new(ProcessedTag, DateTime.UtcNow.ToString("O"))) diff --git a/src/TesApi.Web/Scheduler.cs b/src/TesApi.Web/Scheduler.cs index c56decb57..0329edd05 100644 --- a/src/TesApi.Web/Scheduler.cs +++ b/src/TesApi.Web/Scheduler.cs @@ -163,37 +163,67 @@ private Task ExecuteUpdateTesTaskFromEventBlobAsync(CancellationToken stoppingTo async ValueTask UpdateTesTasksFromEventBlobsAsync(CancellationToken stoppingToken) { var markEventsProcessedList = new ConcurrentBag>(); - Func> getEventsInOrder; + Func> getEventsInOrder; { - var messages = new ConcurrentBag<(RunnerEventsMessage Message, AzureBatchTaskState State)>(); + var messages = new ConcurrentBag<(RunnerEventsMessage Message, TesTask Task, AzureBatchTaskState State)>(); // Get and parse event blobs await 
Parallel.ForEachAsync(batchScheduler.GetEventMessagesAsync(stoppingToken), stoppingToken, async (eventMessage, cancellationToken) => { + var tesTask = await GetTesTaskAsync(eventMessage.Tags["task-id"], eventMessage.Tags["event-name"]); + + if (tesTask is null) + { + return; + } + try { nodeEventProcessor.ValidateMessageMetadata(eventMessage); await nodeEventProcessor.DownloadAndValidateMessageContentAsync(eventMessage, cancellationToken); - messages.Add((eventMessage, nodeEventProcessor.GetMessageBatchState(eventMessage))); + messages.Add((eventMessage, tesTask, await nodeEventProcessor.GetMessageBatchStateAsync(eventMessage, tesTask, cancellationToken))); } catch (Exception ex) { - logger.LogError(ex, @""); - messages.Add((eventMessage, new(AzureBatchTaskState.TaskState.InfoUpdate, Warning: new List + logger.LogError(ex, @"Downloading and parsing event failed: {ErrorMessage}", ex.Message); + messages.Add((eventMessage, tesTask, new(AzureBatchTaskState.TaskState.InfoUpdate, Warning: new List { "EventParsingFailed", $"{ex.GetType().FullName}: {ex.Message}", }))); + + if (ex is System.Diagnostics.UnreachableException) // Don't retry this event. 
+ { + markEventsProcessedList.Add(token => nodeEventProcessor.MarkMessageProcessedAsync(eventMessage, token)); + } + return; } markEventsProcessedList.Add(token => nodeEventProcessor.MarkMessageProcessedAsync(eventMessage, token)); + + // Helpers + async ValueTask GetTesTaskAsync(string id, string @event) + { + TesTask tesTask = default; + if (await repository.TryGetItemAsync(id, cancellationToken, task => tesTask = task) && tesTask is not null) + { + logger.LogDebug("Completing event '{TaskEvent}' for task {TesTask}.", @event, tesTask.Id); + return tesTask; + } + else + { + logger.LogDebug("Could not find task {TesTask} for event '{TaskEvent}'.", id, @event); + return null; + } + } }); - getEventsInOrder = () => nodeEventProcessor.OrderProcessedByExecutorSequence(messages, item => item.Message); + getEventsInOrder = () => nodeEventProcessor.OrderProcessedByExecutorSequence(messages, item => item.Message).Select(item => (item.Task, item.State)); } + // Ensure the IEnumerable is only enumerated one time. 
var orderedMessageList = getEventsInOrder().ToList(); if (!orderedMessageList.Any()) @@ -205,7 +235,7 @@ await Parallel.ForEachAsync(batchScheduler.GetEventMessagesAsync(stoppingToken), await OrchestrateTesTasksOnBatchAsync( "NodeEvent", #pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously - async token => GetTesTasks(token), + async _ => GetTesTasks(), #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously (tesTasks, token) => batchScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, orderedMessageList.Select(t => t.State).ToArray(), token), stoppingToken, @@ -224,22 +254,9 @@ await Parallel.ForEachAsync(markEventsProcessedList, stoppingToken, async (markE }); // Helpers - async IAsyncEnumerable GetTesTasks([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) + IAsyncEnumerable GetTesTasks() { - foreach (var (id, @event) in orderedMessageList.Select(t => t.Message).Select(m => (m.TesTaskId, m.RunnerEventMessage.Name))) - { - TesTask tesTask = default; - if (await repository.TryGetItemAsync(id, cancellationToken, task => tesTask = task) && tesTask is not null) - { - logger.LogDebug("Completing event '{TaskEvent}' for task {TesTask}.", @event, tesTask.Id); - yield return tesTask; - } - else - { - logger.LogDebug("Could not find task {TesTask} for event '{TaskEvent}'.", id, @event); - yield return null; - } - } + return orderedMessageList.Select(t => t.Task).ToAsyncEnumerable(); } } } From d7ca8c470e69fce46ee66cb7e65f6a65222bd24e Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 30 Oct 2023 16:51:13 -0700 Subject: [PATCH 064/202] formatting --- src/TesApi.Web/Events/RunnerEventsProcessor.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TesApi.Web/Events/RunnerEventsProcessor.cs b/src/TesApi.Web/Events/RunnerEventsProcessor.cs index 6f9dba9ee..973175bda 100644 --- a/src/TesApi.Web/Events/RunnerEventsProcessor.cs 
+++ b/src/TesApi.Web/Events/RunnerEventsProcessor.cs @@ -357,7 +357,7 @@ async IAsyncEnumerable GetProcessLogs(Tes.Runner.Events.EventMessage mes var listUri = await storageAccessProvider.GetInternalTesTaskBlobUrlAsync(tesTask, string.Empty, Azure.Storage.Sas.BlobSasPermissions.List, cancellationToken); - await foreach(var uri in azureProxy.ListBlobsAsync(new(listUri), cancellationToken) + await foreach (var uri in azureProxy.ListBlobsAsync(new(listUri), cancellationToken) .Where(blob => blob.BlobName.EndsWith(".txt") && System.IO.Path.GetFileName(blob.BlobName).StartsWith(blobNameStartsWith)) .OrderBy(blob => blob.BlobName) .Select(blob => blob.BlobUri) From 1bcf3ec8834e68e46d0987dd75ae5c7aae75543c Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 30 Oct 2023 16:59:17 -0700 Subject: [PATCH 065/202] code correction and cleanup --- src/Tes/Models/TesTask.cs | 2 +- src/TesApi.Web/Storage/TerraStorageAccessProvider.cs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Tes/Models/TesTask.cs b/src/Tes/Models/TesTask.cs index 3dc59f717..809149171 100644 --- a/src/Tes/Models/TesTask.cs +++ b/src/Tes/Models/TesTask.cs @@ -116,7 +116,7 @@ public TesTask() /// Valid TES task ID public string CreateId() { - var tesTaskIdPrefix = WorkflowId is not null && Guid.TryParse(WorkflowId, out _) ? $"{WorkflowId.Substring(0, 8)}_" : string.Empty; + var tesTaskIdPrefix = WorkflowId is not null && Guid.TryParse(WorkflowId, out _) ? 
$"{WorkflowId[..8]}_" : string.Empty; return $"{tesTaskIdPrefix}{Guid.NewGuid():N}"; } diff --git a/src/TesApi.Web/Storage/TerraStorageAccessProvider.cs b/src/TesApi.Web/Storage/TerraStorageAccessProvider.cs index 6ee65dde9..0489d3b67 100644 --- a/src/TesApi.Web/Storage/TerraStorageAccessProvider.cs +++ b/src/TesApi.Web/Storage/TerraStorageAccessProvider.cs @@ -153,7 +153,7 @@ public override string GetInternalTesTaskBlobUrlWithoutSasToken(TesTask task, st } //passing the resulting string through the builder to ensure that the path is properly encoded and valid - var builder = new BlobUriBuilder(new Uri($"https://{terraOptions.WorkspaceStorageAccountName}.blob.core.windows.net/{blobInfo.WsmContainerName.TrimStart('/')}/{blobInfo.BlobName.TrimStart('/')}")); + var builder = new BlobUriBuilder(new Uri($"https://{terraOptions.WorkspaceStorageAccountName}.blob.core.windows.net/{blobInfo.WsmContainerName.TrimStart('/')}{blobName}\")")); return builder.ToUri().ToString(); } @@ -275,7 +275,7 @@ private Guid ToWorkspaceId(string segmentsContainerName) { ArgumentException.ThrowIfNullOrEmpty(segmentsContainerName); - var guidString = segmentsContainerName.Substring(3); // remove the sc- prefix + var guidString = segmentsContainerName[3..]; // remove the sc- prefix return Guid.Parse(guidString); // throws if not a guid } From 341199bf22104e7c46e49b20b4a3ba137ac87cfe Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Tue, 31 Oct 2023 13:20:20 -0700 Subject: [PATCH 066/202] Process.WaitForExitAsync does not kill the process when the cancellation token is set --- src/deploy-tes-on-azure/Deployer.cs | 28 ++++++++-- src/deploy-tes-on-azure/KubernetesManager.cs | 57 ++++++++++++-------- 2 files changed, 60 insertions(+), 25 deletions(-) diff --git a/src/deploy-tes-on-azure/Deployer.cs b/src/deploy-tes-on-azure/Deployer.cs index 73076235b..dcbe3b00a 100644 --- a/src/deploy-tes-on-azure/Deployer.cs +++ b/src/deploy-tes-on-azure/Deployer.cs @@ -79,7 +79,7 @@ public class Deployer 
private static readonly AsyncRetryPolicy longRetryPolicy = Policy .Handle() - .WaitAndRetryAsync(60, retryAttempt => System.TimeSpan.FromSeconds(15)); + .WaitAndRetryAsync(60, retryAttempt => System.TimeSpan.FromSeconds(15), (exception, timespan) => ConsoleEx.WriteLine($"{exception.GetType().FullName}:{exception.Message}")); public const string ConfigurationContainerName = "configuration"; public const string TesInternalContainerName = "tes-internal"; @@ -626,10 +626,30 @@ await Execute( try { + var startPortForward = new Func(token => kubernetesManager.ExecKubectlProcessAsync($"port-forward -n {configuration.AksCoANamespace} svc/tes 8088:80", token, appendKubeconfig: true)); + var token = tokenSource.Token; - var portForwardTask = kubernetesManager.ExecKubectlProcessAsync($"port-forward -n {configuration.AksCoANamespace} svc/tes 8088:80", token, appendKubeconfig: true); + var portForwardTask = startPortForward(token); + var runTestTask = RunTestTask("localhost:8088", batchAccount.LowPriorityCoreQuota > 0, configuration.TesUsername, configuration.TesPassword); + + for (var task = await Task.WhenAny(portForwardTask, runTestTask); + runTestTask != task; + task = await Task.WhenAny(portForwardTask, runTestTask)) + { + try + { + await portForwardTask; + } + catch (Exception ex) + { + ConsoleEx.WriteLine($"kubectl stopped unexpectedly ({ex.Message}).", ConsoleColor.Red); + } + + ConsoleEx.WriteLine($"Restarting kubectl..."); + portForwardTask = startPortForward(token); + } - var isTestWorkflowSuccessful = await RunTestTask("localhost:8088", batchAccount.LowPriorityCoreQuota > 0, configuration.TesUsername, configuration.TesPassword); + var isTestWorkflowSuccessful = await runTestTask; exitCode = isTestWorkflowSuccessful ? 
0 : 1; if (!isTestWorkflowSuccessful) @@ -913,7 +933,7 @@ private async Task ValidateAndGetExistingAKSClusterAsync() }))) .Where(a => a is not null) .SelectMany(a => a) - .SingleOrDefault(a => a.Name.Equals(serverName, StringComparison.OrdinalIgnoreCase) && regex.Replace(a.Location, "").Equals(configuration.RegionName, StringComparison.OrdinalIgnoreCase)); + .SingleOrDefault(a => a.Name.Equals(serverName, StringComparison.OrdinalIgnoreCase) && regex.Replace(a.Location, string.Empty).Equals(configuration.RegionName, StringComparison.OrdinalIgnoreCase)); } private async Task GetExistingAKSClusterAsync(string aksClusterName) diff --git a/src/deploy-tes-on-azure/KubernetesManager.cs b/src/deploy-tes-on-azure/KubernetesManager.cs index c79d98a5b..a282b7caf 100644 --- a/src/deploy-tes-on-azure/KubernetesManager.cs +++ b/src/deploy-tes-on-azure/KubernetesManager.cs @@ -163,7 +163,7 @@ public async Task EnableIngress(string tesUsername, string tesPassw V1Namespace coaNamespace = null; try { - coaNamespace = await client.CoreV1.ReadNamespaceAsync(configuration.AksCoANamespace); + coaNamespace = await client.CoreV1.ReadNamespaceAsync(configuration.AksCoANamespace, cancellationToken: cToken); } catch { } @@ -181,11 +181,11 @@ public async Task EnableIngress(string tesUsername, string tesPassw if (coaNamespace == null) { - await client.CoreV1.CreateNamespaceAsync(coaNamespaceBody); + await client.CoreV1.CreateNamespaceAsync(coaNamespaceBody, cancellationToken: cToken); } else { - await client.CoreV1.PatchNamespaceAsync(new V1Patch(coaNamespaceBody, V1Patch.PatchType.MergePatch), configuration.AksCoANamespace); + await client.CoreV1.PatchNamespaceAsync(new V1Patch(coaNamespaceBody, V1Patch.PatchType.MergePatch), configuration.AksCoANamespace, cancellationToken: cToken); } // Encryption options: https://httpd.apache.org/docs/2.4/misc/password_encryptions.html @@ -205,7 +205,7 @@ await client.CoreV1.CreateNamespacedSecretAsync(new V1Secret() { "auth", data} }, Type = "Opaque" - 
}, configuration.AksCoANamespace); + }, configuration.AksCoANamespace, cancellationToken: cToken); var helmRepoList = await ExecHelmProcessAsync($"repo list", cToken, workingDirectory: null, throwOnNonZeroExitCode: false); @@ -239,7 +239,7 @@ await ExecHelmProcessAsync("install cert-manager jetstack/cert-manager " + await WaitForWorkloadAsync(client, "cert-manager", configuration.AksCoANamespace, cts.Token); // Wait 10 secs before deploying TES for cert manager to finish starting. - await Task.Delay(TimeSpan.FromSeconds(10)); + await Task.Delay(TimeSpan.FromSeconds(10), cToken); return client; } @@ -575,30 +575,45 @@ void OutputHandler(object sendingProcess, DataReceivedEventArgs outLine) } var process = new Process(); - process.StartInfo.UseShellExecute = false; - process.StartInfo.RedirectStandardOutput = true; - process.StartInfo.RedirectStandardError = true; - process.StartInfo.FileName = binaryFullPath; - process.StartInfo.Arguments = command; - process.OutputDataReceived += new DataReceivedEventHandler(OutputHandler); - process.ErrorDataReceived += new DataReceivedEventHandler(OutputHandler); - - if (!string.IsNullOrWhiteSpace(workingDirectory)) + + try { - process.StartInfo.WorkingDirectory = workingDirectory; + process.StartInfo.UseShellExecute = false; + process.StartInfo.RedirectStandardOutput = true; + process.StartInfo.RedirectStandardError = true; + process.StartInfo.FileName = binaryFullPath; + process.StartInfo.Arguments = command; + process.OutputDataReceived += OutputHandler; + process.ErrorDataReceived += OutputHandler; + + if (!string.IsNullOrWhiteSpace(workingDirectory)) + { + process.StartInfo.WorkingDirectory = workingDirectory; + } + + process.Start(); + process.BeginOutputReadLine(); + process.BeginErrorReadLine(); + await process.WaitForExitAsync(cToken); + } + finally + { + if (cToken.IsCancellationRequested && !process.HasExited) + { + process.Kill(); + } } - process.Start(); - process.BeginOutputReadLine(); - 
process.BeginErrorReadLine(); - await process.WaitForExitAsync(cToken); var output = outputStringBuilder.ToString(); if (throwOnNonZeroExitCode && process.ExitCode != 0) { - foreach (var line in output.Split(Environment.NewLine, StringSplitOptions.RemoveEmptyEntries)) + if (!configuration.DebugLogging) // already written to console { - ConsoleEx.WriteLine($"{tag}: {line}"); + foreach (var line in output.Split(Environment.NewLine, StringSplitOptions.RemoveEmptyEntries)) + { + ConsoleEx.WriteLine($"{tag}: {line}"); + } } Debugger.Break(); From f7260d8265a4713b62a42d92d4fd5cd3feb35345 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Tue, 31 Oct 2023 15:59:57 -0700 Subject: [PATCH 067/202] Address null issues --- src/TesApi.Web/Events/RunnerEventsMessage.cs | 2 +- .../Events/RunnerEventsProcessor.cs | 39 +++++++++++++------ src/deploy-tes-on-azure/Deployer.cs | 12 +----- 3 files changed, 31 insertions(+), 22 deletions(-) diff --git a/src/TesApi.Web/Events/RunnerEventsMessage.cs b/src/TesApi.Web/Events/RunnerEventsMessage.cs index a3062c06d..dac55ad28 100644 --- a/src/TesApi.Web/Events/RunnerEventsMessage.cs +++ b/src/TesApi.Web/Events/RunnerEventsMessage.cs @@ -17,7 +17,7 @@ public record struct RunnerEventsMessage(Uri BlobUri, IDictionary /// /// - public Tes.Runner.Events.EventMessage RunnerEventMessage { get; private set; } + public Tes.Runner.Events.EventMessage RunnerEventMessage { readonly get; private set; } /// /// diff --git a/src/TesApi.Web/Events/RunnerEventsProcessor.cs b/src/TesApi.Web/Events/RunnerEventsProcessor.cs index 973175bda..9c3f5f050 100644 --- a/src/TesApi.Web/Events/RunnerEventsProcessor.cs +++ b/src/TesApi.Web/Events/RunnerEventsProcessor.cs @@ -75,7 +75,7 @@ public void ValidateMessageMetadata(RunnerEventsMessage message) throw new ArgumentException("This message was already processed.", nameof(message)); } - if (!message.Tags.ContainsKey("event-name") || !message.Tags.ContainsKey("task-id")) + if (!message.Tags.ContainsKey("event-name") || 
!message.Tags.ContainsKey("task-id") || !message.Tags.ContainsKey("created")) { throw new ArgumentException("This message is missing needed tags.", nameof(message)); } @@ -96,6 +96,8 @@ public void ValidateMessageMetadata(RunnerEventsMessage message) /// public async Task DownloadAndValidateMessageContentAsync(RunnerEventsMessage message, CancellationToken cancellationToken) { + ArgumentNullException.ThrowIfNull(message); + Tes.Runner.Events.EventMessage result; try @@ -109,6 +111,9 @@ public async Task DownloadAndValidateMessageContentAsync(RunnerEventsMessage mes throw new InvalidOperationException($"Event message blob is malformed. {ex.GetType().FullName}:{ex.Message}", ex); } + message.SetRunnerEventMessage(result); + + // Validate content Assert(Guid.TryParse(result.Id, out _), $"{nameof(result.Id)}('{result.Id}') is malformed."); Assert(Tes.Runner.Events.EventsPublisher.EventVersion.Equals(result.EventVersion, StringComparison.Ordinal), @@ -170,8 +175,6 @@ public async Task DownloadAndValidateMessageContentAsync(RunnerEventsMessage mes break; } - message.SetRunnerEventMessage(result); - static void Assert([System.Diagnostics.CodeAnalysis.DoesNotReturnIf(false)] bool condition, string message) { if (!condition) @@ -181,14 +184,14 @@ static void Assert([System.Diagnostics.CodeAnalysis.DoesNotReturnIf(false)] bool } } - private readonly IReadOnlyDictionary EventsInOrder = new Dictionary(StringComparer.OrdinalIgnoreCase) + private static readonly IReadOnlyDictionary EventsInOrder = new Dictionary(StringComparer.OrdinalIgnoreCase) { - { Tes.Runner.Events.EventsPublisher.DownloadStartEvent, 20 }, - { Tes.Runner.Events.EventsPublisher.DownloadEndEvent, 30 }, - { Tes.Runner.Events.EventsPublisher.ExecutorStartEvent, 40 }, - { Tes.Runner.Events.EventsPublisher.ExecutorEndEvent, 50 }, - { Tes.Runner.Events.EventsPublisher.UploadStartEvent, 60 }, - { Tes.Runner.Events.EventsPublisher.UploadEndEvent, 70 }, + { Tes.Runner.Events.EventsPublisher.DownloadStartEvent, 
int.MinValue }, + { Tes.Runner.Events.EventsPublisher.DownloadEndEvent, int.MinValue + 1 }, + { Tes.Runner.Events.EventsPublisher.ExecutorStartEvent, -1 }, + { Tes.Runner.Events.EventsPublisher.ExecutorEndEvent, +1 }, + { Tes.Runner.Events.EventsPublisher.UploadStartEvent, int.MaxValue - 1 }, + { Tes.Runner.Events.EventsPublisher.UploadEndEvent, int.MaxValue }, }.AsReadOnly(); /// @@ -200,7 +203,21 @@ static void Assert([System.Diagnostics.CodeAnalysis.DoesNotReturnIf(false)] bool /// public IEnumerable OrderProcessedByExecutorSequence(IEnumerable source, Func messageGetter) { - return source.OrderBy(t => messageGetter(t).RunnerEventMessage.Created).ThenBy(t => EventsInOrder.TryGetValue(messageGetter(t).RunnerEventMessage.Name, out var result) ? result : -1); + ArgumentNullException.ThrowIfNull(source); + ArgumentNullException.ThrowIfNull(messageGetter); + + return source.OrderBy(t => OrderBy(messageGetter(t))).ThenBy(t => ThenBy(messageGetter(t))); + + static DateTime OrderBy(RunnerEventsMessage message) + => message.RunnerEventMessage?.Created ?? DateTime.Parse(message.Tags["created"]).ToUniversalTime(); + + static int ThenBy(RunnerEventsMessage message) + => ParseEventName(message.RunnerEventMessage is null + ? message.Tags["event-name"] + : message.RunnerEventMessage.Name); + + static int ParseEventName(string eventName) + => EventsInOrder.TryGetValue(eventName, out var result) ? 
result : int.MinValue; } /// diff --git a/src/deploy-tes-on-azure/Deployer.cs b/src/deploy-tes-on-azure/Deployer.cs index dcbe3b00a..517e782cc 100644 --- a/src/deploy-tes-on-azure/Deployer.cs +++ b/src/deploy-tes-on-azure/Deployer.cs @@ -2065,24 +2065,16 @@ private async Task ValidateAndGetExistingBatchAccountAsync() throw new ValidationException($"Resource group '{configuration.VnetResourceGroupName}' does not exist."); } - var vnet = await azureSubscriptionClient.Networks.GetByResourceGroupAsync(configuration.VnetResourceGroupName, configuration.VnetName); - - if (vnet is null) - { + var vnet = await azureSubscriptionClient.Networks.GetByResourceGroupAsync(configuration.VnetResourceGroupName, configuration.VnetName) ?? throw new ValidationException($"Virtual network '{configuration.VnetName}' does not exist in resource group '{configuration.VnetResourceGroupName}'."); - } if (!vnet.RegionName.Equals(configuration.RegionName, StringComparison.OrdinalIgnoreCase)) { throw new ValidationException($"Virtual network '{configuration.VnetName}' must be in the same region that you are deploying to ({configuration.RegionName})."); } - var vmSubnet = vnet.Subnets.FirstOrDefault(s => s.Key.Equals(configuration.VmSubnetName, StringComparison.OrdinalIgnoreCase)).Value; - - if (vmSubnet is null) - { + var vmSubnet = vnet.Subnets.FirstOrDefault(s => s.Key.Equals(configuration.VmSubnetName, StringComparison.OrdinalIgnoreCase)).Value ?? 
throw new ValidationException($"Virtual network '{configuration.VnetName}' does not contain subnet '{configuration.VmSubnetName}'"); - } var resourceGraphClient = new ResourceGraphClient(tokenCredentials); var postgreSqlSubnet = vnet.Subnets.FirstOrDefault(s => s.Key.Equals(configuration.PostgreSqlSubnetName, StringComparison.OrdinalIgnoreCase)).Value; From d1d9c46c20cebd42a7c89241ce22cde0055e00d7 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Tue, 31 Oct 2023 17:19:31 -0700 Subject: [PATCH 068/202] address NullReferenceException --- .../Events/RunnerEventsProcessor.cs | 24 ++++++++++++------- src/TesApi.Web/Scheduler.cs | 5 ++-- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/src/TesApi.Web/Events/RunnerEventsProcessor.cs b/src/TesApi.Web/Events/RunnerEventsProcessor.cs index 9c3f5f050..ff8a738d0 100644 --- a/src/TesApi.Web/Events/RunnerEventsProcessor.cs +++ b/src/TesApi.Web/Events/RunnerEventsProcessor.cs @@ -48,8 +48,6 @@ public RunnerEventsProcessor(IAzureProxy azureProxy, Storage.IStorageAccessProvi /// public void ValidateMessageMetadata(RunnerEventsMessage message) { - ArgumentNullException.ThrowIfNull(message); - if (message.BlobUri is null) { throw new ArgumentException("This message's URL is missing.", nameof(message)); @@ -96,8 +94,6 @@ public void ValidateMessageMetadata(RunnerEventsMessage message) /// public async Task DownloadAndValidateMessageContentAsync(RunnerEventsMessage message, CancellationToken cancellationToken) { - ArgumentNullException.ThrowIfNull(message); - Tes.Runner.Events.EventMessage result; try @@ -125,11 +121,11 @@ public async Task DownloadAndValidateMessageContentAsync(RunnerEventsMessage mes Assert(message.TesTaskId.Equals(result.EntityId, StringComparison.Ordinal), $"{nameof(result.EntityId)}('{result.EntityId}') does not match the expected value of '{message.TesTaskId}'."); - Assert(result.EntityId.Equals(message.Tags["task-id"], StringComparison.Ordinal), - $"{nameof(result.Name)}('{result.EntityId}') 
does not match the expected value of '{message.Tags["task-id"]}' from the tags.."); + Assert(message.Tags["task-id"].Equals(result.EntityId, StringComparison.Ordinal), + $"{nameof(result.EntityId)}('{result.EntityId}') does not match the expected value of '{message.Tags["task-id"]}' from the tags.."); Assert(message.Event.Equals(result.Name, StringComparison.OrdinalIgnoreCase), $"{nameof(result.Name)}('{result.Name}') does not match the expected value of '{message.Event}' from the blob path."); - Assert(result.Name.Equals(message.Tags["event-name"], StringComparison.Ordinal), + Assert(message.Tags["event-name"].Equals(result.Name, StringComparison.Ordinal), $"{nameof(result.Name)}('{result.Name}') does not match the expected value of '{message.Tags["event-name"]}' from the tags."); // Event type specific validations @@ -179,7 +175,7 @@ static void Assert([System.Diagnostics.CodeAnalysis.DoesNotReturnIf(false)] bool { if (!condition) { - throw new InvalidOperationException(message); + throw new AssertException(message); } } } @@ -229,7 +225,6 @@ static int ParseEventName(string eventName) /// public async Task GetMessageBatchStateAsync(RunnerEventsMessage message, Tes.Models.TesTask tesTask, CancellationToken cancellationToken) { - ArgumentNullException.ThrowIfNull(message); ArgumentNullException.ThrowIfNull(message.RunnerEventMessage, nameof(message)); var nodeMessage = message.RunnerEventMessage; @@ -400,5 +395,16 @@ await azureProxy.SetBlobTags( .ToDictionary(pair => pair.Key, pair => pair.Value), cancellationToken); } + + /// + /// Validation assert failed. 
+ /// + public class AssertException : InvalidOperationException + { + /// + public AssertException(string message) : base(message) + { + } + } } } diff --git a/src/TesApi.Web/Scheduler.cs b/src/TesApi.Web/Scheduler.cs index 0329edd05..40b5f8017 100644 --- a/src/TesApi.Web/Scheduler.cs +++ b/src/TesApi.Web/Scheduler.cs @@ -182,7 +182,8 @@ await Parallel.ForEachAsync(batchScheduler.GetEventMessagesAsync(stoppingToken), { nodeEventProcessor.ValidateMessageMetadata(eventMessage); await nodeEventProcessor.DownloadAndValidateMessageContentAsync(eventMessage, cancellationToken); - messages.Add((eventMessage, tesTask, await nodeEventProcessor.GetMessageBatchStateAsync(eventMessage, tesTask, cancellationToken))); + var state = await nodeEventProcessor.GetMessageBatchStateAsync(eventMessage, tesTask, cancellationToken); + messages.Add((eventMessage, tesTask, state)); } catch (Exception ex) { @@ -193,7 +194,7 @@ await Parallel.ForEachAsync(batchScheduler.GetEventMessagesAsync(stoppingToken), $"{ex.GetType().FullName}: {ex.Message}", }))); - if (ex is System.Diagnostics.UnreachableException) // Don't retry this event. + if (ex is System.Diagnostics.UnreachableException || ex is RunnerEventsProcessor.AssertException) // Don't retry this event. 
{ markEventsProcessedList.Add(token => nodeEventProcessor.MarkMessageProcessedAsync(eventMessage, token)); } From ed83f0109355dbf2b8c56b7682dfb6e74ab88db7 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 1 Nov 2023 10:41:12 -0700 Subject: [PATCH 069/202] address NullReferenceException again --- src/TesApi.Web/Events/RunnerEventsMessage.cs | 28 +++---- .../Events/RunnerEventsProcessor.cs | 76 ++++++++++--------- src/TesApi.Web/Scheduler.cs | 2 +- 3 files changed, 49 insertions(+), 57 deletions(-) diff --git a/src/TesApi.Web/Events/RunnerEventsMessage.cs b/src/TesApi.Web/Events/RunnerEventsMessage.cs index dac55ad28..033979ba6 100644 --- a/src/TesApi.Web/Events/RunnerEventsMessage.cs +++ b/src/TesApi.Web/Events/RunnerEventsMessage.cs @@ -12,32 +12,22 @@ namespace TesApi.Web.Events /// URL of the event message. /// Tags on the event message blob. /// Name of the event based on parsing the blob's BlobName. - public record struct RunnerEventsMessage(Uri BlobUri, IDictionary Tags, string Event) + /// The content of the event message. + public record struct RunnerEventsMessage(Uri BlobUri, IDictionary Tags, string Event, Tes.Runner.Events.EventMessage RunnerEventMessage = default) { /// - /// + /// Copy constructor replacing . /// - public Tes.Runner.Events.EventMessage RunnerEventMessage { readonly get; private set; } + /// + /// + public RunnerEventsMessage(RunnerEventsMessage original, Tes.Runner.Events.EventMessage runnerEventMessage) + : this(original.BlobUri, original.Tags, original.Event, runnerEventMessage) + { + } /// /// /// public readonly string TesTaskId => RunnerEventMessage?.EntityId; - - /// - /// Sets . - /// - /// The downloaded event message associated with this storage blob. 
- public void SetRunnerEventMessage(Tes.Runner.Events.EventMessage eventMessage) - { - ArgumentNullException.ThrowIfNull(eventMessage); - - if (RunnerEventMessage is not null) - { - throw new InvalidOperationException("RunnerEventMessage has already been set."); - } - - RunnerEventMessage = eventMessage; - } } } diff --git a/src/TesApi.Web/Events/RunnerEventsProcessor.cs b/src/TesApi.Web/Events/RunnerEventsProcessor.cs index ff8a738d0..220ae6d3b 100644 --- a/src/TesApi.Web/Events/RunnerEventsProcessor.cs +++ b/src/TesApi.Web/Events/RunnerEventsProcessor.cs @@ -92,14 +92,14 @@ public void ValidateMessageMetadata(RunnerEventsMessage message) /// /// /// - public async Task DownloadAndValidateMessageContentAsync(RunnerEventsMessage message, CancellationToken cancellationToken) + public async Task DownloadAndValidateMessageContentAsync(RunnerEventsMessage message, CancellationToken cancellationToken) { - Tes.Runner.Events.EventMessage result; + Tes.Runner.Events.EventMessage content; try { var messageText = await azureProxy.DownloadBlobAsync(message.BlobUri, cancellationToken); - result = System.Text.Json.JsonSerializer.Deserialize(messageText) + content = System.Text.Json.JsonSerializer.Deserialize(messageText) ?? throw new InvalidOperationException("Deserialize() returned null."); } catch (Exception ex) @@ -107,70 +107,72 @@ public async Task DownloadAndValidateMessageContentAsync(RunnerEventsMessage mes throw new InvalidOperationException($"Event message blob is malformed. 
{ex.GetType().FullName}:{ex.Message}", ex); } - message.SetRunnerEventMessage(result); + message = new(message, content); // Validate content - Assert(Guid.TryParse(result.Id, out _), - $"{nameof(result.Id)}('{result.Id}') is malformed."); - Assert(Tes.Runner.Events.EventsPublisher.EventVersion.Equals(result.EventVersion, StringComparison.Ordinal), - $"{nameof(result.EventVersion)}('{result.EventVersion}') is not recognized."); - Assert(Tes.Runner.Events.EventsPublisher.EventDataVersion.Equals(result.EventDataVersion, StringComparison.Ordinal), - $"{nameof(result.EventDataVersion)}('{result.EventDataVersion}') is not recognized."); - Assert(Tes.Runner.Events.EventsPublisher.TesTaskRunnerEntityType.Equals(result.EntityType, StringComparison.Ordinal), - $"{nameof(result.EntityType)}('{result.EntityType}') is not recognized."); - - Assert(message.TesTaskId.Equals(result.EntityId, StringComparison.Ordinal), - $"{nameof(result.EntityId)}('{result.EntityId}') does not match the expected value of '{message.TesTaskId}'."); - Assert(message.Tags["task-id"].Equals(result.EntityId, StringComparison.Ordinal), - $"{nameof(result.EntityId)}('{result.EntityId}') does not match the expected value of '{message.Tags["task-id"]}' from the tags.."); - Assert(message.Event.Equals(result.Name, StringComparison.OrdinalIgnoreCase), - $"{nameof(result.Name)}('{result.Name}') does not match the expected value of '{message.Event}' from the blob path."); - Assert(message.Tags["event-name"].Equals(result.Name, StringComparison.Ordinal), - $"{nameof(result.Name)}('{result.Name}') does not match the expected value of '{message.Tags["event-name"]}' from the tags."); + Assert(Guid.TryParse(content.Id, out _), + $"{nameof(content.Id)}('{content.Id}') is malformed."); + Assert(Tes.Runner.Events.EventsPublisher.EventVersion.Equals(content.EventVersion, StringComparison.Ordinal), + $"{nameof(content.EventVersion)}('{content.EventVersion}') is not recognized."); + 
Assert(Tes.Runner.Events.EventsPublisher.EventDataVersion.Equals(content.EventDataVersion, StringComparison.Ordinal), + $"{nameof(content.EventDataVersion)}('{content.EventDataVersion}') is not recognized."); + Assert(Tes.Runner.Events.EventsPublisher.TesTaskRunnerEntityType.Equals(content.EntityType, StringComparison.Ordinal), + $"{nameof(content.EntityType)}('{content.EntityType}') is not recognized."); + + Assert(message.TesTaskId.Equals(content.EntityId, StringComparison.Ordinal), + $"{nameof(content.EntityId)}('{content.EntityId}') does not match the expected value of '{message.TesTaskId}'."); + Assert(message.Tags["task-id"].Equals(content.EntityId, StringComparison.Ordinal), + $"{nameof(content.EntityId)}('{content.EntityId}') does not match the expected value of '{message.Tags["task-id"]}' from the tags.."); + Assert(message.Event.Equals(content.Name, StringComparison.OrdinalIgnoreCase), + $"{nameof(content.Name)}('{content.Name}') does not match the expected value of '{message.Event}' from the blob path."); + Assert(message.Tags["event-name"].Equals(content.Name, StringComparison.Ordinal), + $"{nameof(content.Name)}('{content.Name}') does not match the expected value of '{message.Tags["event-name"]}' from the tags."); // Event type specific validations - switch (result.Name) + switch (content.Name) { case Tes.Runner.Events.EventsPublisher.DownloadStartEvent: - Assert(Tes.Runner.Events.EventsPublisher.StartedStatus.Equals(result.StatusMessage, StringComparison.Ordinal), - $"{nameof(result.StatusMessage)}('{result.StatusMessage}') does not match the expected value of '{Tes.Runner.Events.EventsPublisher.StartedStatus}'."); + Assert(Tes.Runner.Events.EventsPublisher.StartedStatus.Equals(content.StatusMessage, StringComparison.Ordinal), + $"{nameof(content.StatusMessage)}('{content.StatusMessage}') does not match the expected value of '{Tes.Runner.Events.EventsPublisher.StartedStatus}'."); break; case Tes.Runner.Events.EventsPublisher.DownloadEndEvent: - 
Assert(new[] { Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(result.StatusMessage), - $"{nameof(result.StatusMessage)}('{result.StatusMessage}') does not match one of the expected valued of '{Tes.Runner.Events.EventsPublisher.SuccessStatus}' or '{Tes.Runner.Events.EventsPublisher.FailedStatus}'."); + Assert(new[] { Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(content.StatusMessage), + $"{nameof(content.StatusMessage)}('{content.StatusMessage}') does not match one of the expected valued of '{Tes.Runner.Events.EventsPublisher.SuccessStatus}' or '{Tes.Runner.Events.EventsPublisher.FailedStatus}'."); break; case Tes.Runner.Events.EventsPublisher.UploadStartEvent: - Assert(Tes.Runner.Events.EventsPublisher.StartedStatus.Equals(result.StatusMessage, StringComparison.Ordinal), - $"{nameof(result.StatusMessage)}('{result.StatusMessage}') does not match the expected value of '{Tes.Runner.Events.EventsPublisher.StartedStatus}'."); + Assert(Tes.Runner.Events.EventsPublisher.StartedStatus.Equals(content.StatusMessage, StringComparison.Ordinal), + $"{nameof(content.StatusMessage)}('{content.StatusMessage}') does not match the expected value of '{Tes.Runner.Events.EventsPublisher.StartedStatus}'."); break; case Tes.Runner.Events.EventsPublisher.UploadEndEvent: - Assert(new[] { Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(result.StatusMessage), - $"{nameof(result.StatusMessage)}('{result.StatusMessage}') does not match one of the expected valued of '{Tes.Runner.Events.EventsPublisher.SuccessStatus}' or '{Tes.Runner.Events.EventsPublisher.FailedStatus}'."); + Assert(new[] { Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(content.StatusMessage), + $"{nameof(content.StatusMessage)}('{content.StatusMessage}') does not match one of the expected 
valued of '{Tes.Runner.Events.EventsPublisher.SuccessStatus}' or '{Tes.Runner.Events.EventsPublisher.FailedStatus}'."); break; case Tes.Runner.Events.EventsPublisher.ExecutorStartEvent: - Assert(Tes.Runner.Events.EventsPublisher.StartedStatus.Equals(result.StatusMessage, StringComparison.Ordinal), - $"{nameof(result.StatusMessage)}('{result.StatusMessage}') does not match the expected value of '{Tes.Runner.Events.EventsPublisher.StartedStatus}'."); + Assert(Tes.Runner.Events.EventsPublisher.StartedStatus.Equals(content.StatusMessage, StringComparison.Ordinal), + $"{nameof(content.StatusMessage)}('{content.StatusMessage}') does not match the expected value of '{Tes.Runner.Events.EventsPublisher.StartedStatus}'."); break; case Tes.Runner.Events.EventsPublisher.ExecutorEndEvent: - Assert(new[] { Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(result.StatusMessage), - $"{nameof(result.StatusMessage)}('{result.StatusMessage}') does not match one of the expected valued of '{Tes.Runner.Events.EventsPublisher.SuccessStatus}' or '{Tes.Runner.Events.EventsPublisher.FailedStatus}'."); + Assert(new[] { Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(content.StatusMessage), + $"{nameof(content.StatusMessage)}('{content.StatusMessage}') does not match one of the expected valued of '{Tes.Runner.Events.EventsPublisher.SuccessStatus}' or '{Tes.Runner.Events.EventsPublisher.FailedStatus}'."); break; case Tes.Runner.Events.EventsPublisher.TaskCompletionEvent: - Assert(new[] { Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(result.StatusMessage), - $"{nameof(result.StatusMessage)}('{result.StatusMessage}') does not match one of the expected valued of '{Tes.Runner.Events.EventsPublisher.SuccessStatus}' or '{Tes.Runner.Events.EventsPublisher.FailedStatus}'."); + Assert(new[] { 
Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(content.StatusMessage), + $"{nameof(content.StatusMessage)}('{content.StatusMessage}') does not match one of the expected valued of '{Tes.Runner.Events.EventsPublisher.SuccessStatus}' or '{Tes.Runner.Events.EventsPublisher.FailedStatus}'."); break; default: - Assert(false, $"{nameof(result.Name)}('{result.Name}') is not recognized."); + Assert(false, $"{nameof(content.Name)}('{content.Name}') is not recognized."); break; } + return message; + static void Assert([System.Diagnostics.CodeAnalysis.DoesNotReturnIf(false)] bool condition, string message) { if (!condition) diff --git a/src/TesApi.Web/Scheduler.cs b/src/TesApi.Web/Scheduler.cs index 40b5f8017..a6b29147f 100644 --- a/src/TesApi.Web/Scheduler.cs +++ b/src/TesApi.Web/Scheduler.cs @@ -181,7 +181,7 @@ await Parallel.ForEachAsync(batchScheduler.GetEventMessagesAsync(stoppingToken), try { nodeEventProcessor.ValidateMessageMetadata(eventMessage); - await nodeEventProcessor.DownloadAndValidateMessageContentAsync(eventMessage, cancellationToken); + eventMessage = await nodeEventProcessor.DownloadAndValidateMessageContentAsync(eventMessage, cancellationToken); var state = await nodeEventProcessor.GetMessageBatchStateAsync(eventMessage, tesTask, cancellationToken); messages.Add((eventMessage, tesTask, state)); } From 28364dddb947d69be22c1cbf41e2b7c322e1a6ce Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 2 Nov 2023 14:33:47 -0700 Subject: [PATCH 070/202] Fix two exceptions and some code cleanup --- src/TesApi.Web/AzureProxy.cs | 31 +++++++------- src/TesApi.Web/BatchPool.cs | 40 +++++++++++-------- src/TesApi.Web/BatchPoolService.cs | 4 +- src/TesApi.Web/BatchPools.BatchScheduler.cs | 12 ++---- src/TesApi.Web/BatchScheduler.cs | 2 +- .../CachingWithRetriesAzureProxy.cs | 2 +- src/TesApi.Web/IAzureProxy.cs | 2 +- src/TesApi.Web/IBatchPool.cs | 8 ++-- 8 files changed, 54 insertions(+), 47 deletions(-) diff 
--git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index 9757627fd..58c838c98 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -9,6 +9,7 @@ using System.Threading; using System.Threading.Tasks; using Azure.Storage.Blobs; +using Azure.Storage.Blobs.Models; using Azure.Storage.Blobs.Specialized; using Microsoft.Azure.Batch; using Microsoft.Azure.Batch.Auth; @@ -22,7 +23,6 @@ using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using Microsoft.Rest; -using Microsoft.WindowsAzure.Storage.Blob; using Polly; using Polly.Retry; using TesApi.Web.Extensions; @@ -369,29 +369,32 @@ public async Task GetStorageAccountKeyAsync(StorageAccountInfo storageAc /// public Task UploadBlobAsync(Uri blobAbsoluteUri, string content, CancellationToken cancellationToken) - => new CloudBlockBlob(blobAbsoluteUri).UploadTextAsync(content, null, null, null, null, cancellationToken); + => new BlobClient(blobAbsoluteUri, new(BlobClientOptions.ServiceVersion.V2021_04_10)) + .UploadAsync(BinaryData.FromString(content), options: null, cancellationToken); /// public Task UploadBlobFromFileAsync(Uri blobAbsoluteUri, string filePath, CancellationToken cancellationToken) - => new CloudBlockBlob(blobAbsoluteUri).UploadFromFileAsync(filePath, null, null, null, cancellationToken); + => new BlobClient(blobAbsoluteUri, new(BlobClientOptions.ServiceVersion.V2021_04_10)) + .UploadAsync(filePath, options: null, cancellationToken); /// - public Task DownloadBlobAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) - => new CloudBlockBlob(blobAbsoluteUri).DownloadTextAsync(null, null, null, null, cancellationToken); + public async Task DownloadBlobAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) + => (await new BlobClient(blobAbsoluteUri, new(BlobClientOptions.ServiceVersion.V2021_04_10)) + .DownloadContentAsync(cancellationToken)).Value.Content.ToString(); /// - public Task BlobExistsAsync(Uri blobAbsoluteUri, 
CancellationToken cancellationToken) - => new CloudBlockBlob(blobAbsoluteUri).ExistsAsync(null, null, cancellationToken); + public async Task BlobExistsAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) + => await new BlobClient(blobAbsoluteUri, new(BlobClientOptions.ServiceVersion.V2021_04_10)) + .ExistsAsync(cancellationToken); /// public async Task GetBlobPropertiesAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) { - var blob = new CloudBlockBlob(blobAbsoluteUri); + var blob = new BlobClient(blobAbsoluteUri, new(BlobClientOptions.ServiceVersion.V2021_04_10)); - if (await blob.ExistsAsync(null, null, cancellationToken)) + if (await blob.ExistsAsync(cancellationToken)) { - await blob.FetchAttributesAsync(null, null, null, cancellationToken); - return blob.Properties; + return await blob.GetPropertiesAsync(cancellationToken: cancellationToken); } return default; @@ -400,18 +403,18 @@ public async Task GetBlobPropertiesAsync(Uri blobAbsoluteUri, Ca /// public IAsyncEnumerable ListBlobsAsync(Uri directoryUri, CancellationToken cancellationToken) { - var directory = (new BlobClient(directoryUri, new(BlobClientOptions.ServiceVersion.V2021_04_10))); + var directory = new BlobClient(directoryUri, new(BlobClientOptions.ServiceVersion.V2021_04_10)); return directory.GetParentBlobContainerClient() .GetBlobsAsync(prefix: directory.Name.TrimEnd('/') + "/", cancellationToken: cancellationToken) .Select(blobItem => new BlobNameAndUri(blobItem.Name, new BlobUriBuilder(directory.Uri) { Sas = null, BlobName = blobItem.Name, Query = null }.ToUri())); } /// - public IAsyncEnumerable ListBlobsWithTagsAsync(Uri containerUri, string prefix, CancellationToken cancellationToken) + public IAsyncEnumerable ListBlobsWithTagsAsync(Uri containerUri, string prefix, CancellationToken cancellationToken) { BlobContainerClient container = new(containerUri, new(BlobClientOptions.ServiceVersion.V2021_04_10)); - return 
container.GetBlobsAsync(Azure.Storage.Blobs.Models.BlobTraits.Tags, prefix: prefix, cancellationToken: cancellationToken); + return container.GetBlobsAsync(BlobTraits.Tags, prefix: prefix, cancellationToken: cancellationToken); } /// diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index c01d86232..80335740a 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -347,19 +347,27 @@ private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken canc async Task SendNodeErrorData(string nodeId, IReadOnlyList content) { - var url = await _storageAccessProvider.GetInternalTesBlobUrlAsync( - $"nodeError/{nodeId}-{new Guid():B}", + var url = new Uri(await _storageAccessProvider.GetInternalTesBlobUrlAsync( + $"nodeError/{nodeId}-{new Guid():N}", Azure.Storage.Sas.BlobSasPermissions.Create, - cancellationToken); - await _azureProxy.UploadBlobAsync( - new(url), - System.Text.Json.JsonSerializer.Serialize(content, - new System.Text.Json.JsonSerializerOptions() - { - DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingDefault, - Converters = { new System.Text.Json.Serialization.JsonStringEnumConverter(System.Text.Json.JsonNamingPolicy.CamelCase) } - }), - cancellationToken); + cancellationToken)); + + if (content is null || content!.Any()) + { + await _azureProxy.UploadBlobAsync(url, "No recent tasks found on node.", cancellationToken); + } + else + { + await _azureProxy.UploadBlobAsync( + url, + System.Text.Json.JsonSerializer.Serialize(content, + new System.Text.Json.JsonSerializerOptions() + { + DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingDefault, + Converters = { new System.Text.Json.Serialization.JsonStringEnumConverter(System.Text.Json.JsonNamingPolicy.CamelCase) } + }), + cancellationToken); + } } // It's documented that a max of 100 nodes can be removed at a time. Excess eligible nodes will be removed in a future call to this method. 
@@ -517,7 +525,7 @@ internal enum ServiceKind public string Id { get; private set; } /// - public async ValueTask CanBeDeleted(CancellationToken cancellationToken = default) + public async ValueTask CanBeDeletedAsync(CancellationToken cancellationToken = default) { if (await GetTasksAsync(includeCompleted: true).AnyAsync(cancellationToken)) { @@ -647,7 +655,7 @@ async ValueTask RemoveMissingPoolsAsync(Exception ex, CancellationToken ca } /// - public IAsyncEnumerable GetTaskResizeFailures(CancellationToken cancellationToken) + public IAsyncEnumerable GetTaskResizeFailuresAsync(CancellationToken cancellationToken) { return GetTasksAsync("id", "state eq 'active'").Zip( GetFailures(cancellationToken), @@ -693,11 +701,11 @@ TaskFailureInformation PopNextStartTaskFailure() } /// - public IAsyncEnumerable GetCompletedTasks(CancellationToken _1) + public IAsyncEnumerable GetCompletedTasksAsync(CancellationToken _1) => GetTasksAsync("id,executionInfo", $"state eq 'completed' and stateTransitionTime lt DateTime'{DateTime.UtcNow - TimeSpan.FromMinutes(2):O}'"); /// - public async ValueTask GetAllocationStateTransitionTime(CancellationToken cancellationToken = default) + public async ValueTask GetAllocationStateTransitionTimeAsync(CancellationToken cancellationToken = default) => (await _azureProxy.GetBatchPoolAsync(Id, cancellationToken, new ODATADetailLevel { SelectClause = "allocationStateTransitionTime" })).AllocationStateTransitionTime ?? 
DateTime.UtcNow; /// diff --git a/src/TesApi.Web/BatchPoolService.cs b/src/TesApi.Web/BatchPoolService.cs index 76b1a67d6..88706250b 100644 --- a/src/TesApi.Web/BatchPoolService.cs +++ b/src/TesApi.Web/BatchPoolService.cs @@ -110,7 +110,7 @@ await ExecuteActionOnPoolsAsync( async (pool, token) => { await pool.ServicePoolAsync(token); - await ProcessFailures(pool.GetTaskResizeFailures(token), token); + await ProcessFailures(pool.GetTaskResizeFailuresAsync(token), token); }, stoppingToken); @@ -159,7 +159,7 @@ private async ValueTask ProcessCompletedCloudTasksAsync(CancellationToken stoppi { var tasks = new ConcurrentBag(); - await ExecuteActionOnPoolsAsync("Service Batch Tasks", async (pool, token) => await pool.GetCompletedTasks(token).ForEachAsync(tasks.Add, token), stoppingToken); + await ExecuteActionOnPoolsAsync("Service Batch Tasks", async (pool, token) => await pool.GetCompletedTasksAsync(token).ForEachAsync(tasks.Add, token), stoppingToken); if (tasks.IsEmpty) { diff --git a/src/TesApi.Web/BatchPools.BatchScheduler.cs b/src/TesApi.Web/BatchPools.BatchScheduler.cs index 845f5158c..a16de041f 100644 --- a/src/TesApi.Web/BatchPools.BatchScheduler.cs +++ b/src/TesApi.Web/BatchPools.BatchScheduler.cs @@ -185,16 +185,12 @@ public async ValueTask FlushPoolsAsync(IEnumerable assignedPools, Cancel try { - var pools = (await batchPools.GetAllPools() - .ToAsyncEnumerable() - .WhereAwait(async p => await p.CanBeDeleted(cancellationToken)) - .ToListAsync(cancellationToken)) + await foreach (var pool in batchPools.GetAllPools().ToAsyncEnumerable() + .WhereAwait(async p => await p.CanBeDeletedAsync(cancellationToken)) .Where(p => !assignedPools.Contains(p.Id)) - .OrderBy(p => p.GetAllocationStateTransitionTime(cancellationToken)) + .OrderByAwait(p => p.GetAllocationStateTransitionTimeAsync(cancellationToken)) .Take(neededPools.Count) - .ToList(); - - foreach (var pool in pools) + .WithCancellation(cancellationToken)) { await DeletePoolAsync(pool, cancellationToken); _ = 
RemovePoolFromList(pool); diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 5cd1a6b49..b027b2b3a 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -430,7 +430,7 @@ public async Task UploadTaskRunnerIfNeeded(CancellationToken cancellationToken) { var blobUri = new Uri(await storageAccessProvider.GetInternalTesBlobUrlAsync(NodeTaskRunnerFilename, storageAccessProvider.BlobPermissionsWithWrite, cancellationToken)); var blobProperties = await azureProxy.GetBlobPropertiesAsync(blobUri, cancellationToken); - if (!(await File.ReadAllTextAsync(Path.Combine(AppContext.BaseDirectory, $"scripts/{NodeTaskRunnerMD5HashFilename}"), cancellationToken)).Trim().Equals(blobProperties?.ContentMD5, StringComparison.OrdinalIgnoreCase)) + if (!(await File.ReadAllTextAsync(Path.Combine(AppContext.BaseDirectory, $"scripts/{NodeTaskRunnerMD5HashFilename}"), cancellationToken)).Trim().Equals(blobProperties is null ? string.Empty : Convert.ToBase64String(blobProperties.ContentHash), StringComparison.OrdinalIgnoreCase)) { await azureProxy.UploadBlobFromFileAsync(blobUri, $"scripts/{NodeTaskRunnerFilename}", cancellationToken); } diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index 8dd3a1579..5243ec0c7 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -254,7 +254,7 @@ public Task UploadBlobFromFileAsync(Uri blobAbsoluteUri, string filePath, Cancel } /// - public Task GetBlobPropertiesAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) + public Task GetBlobPropertiesAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) { var ctx = new Context(); ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying GetBlobPropertiesAsync ({RetryCount}).", retryCount)); diff --git a/src/TesApi.Web/IAzureProxy.cs b/src/TesApi.Web/IAzureProxy.cs index 
03b5bc022..aa560421e 100644 --- a/src/TesApi.Web/IAzureProxy.cs +++ b/src/TesApi.Web/IAzureProxy.cs @@ -145,7 +145,7 @@ public interface IAzureProxy /// Absolute Blob URI /// A for controlling the lifetime of the asynchronous operation. /// - Task GetBlobPropertiesAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken); + Task GetBlobPropertiesAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken); /// /// List blobs whose tags match a given search expression in the given directory. diff --git a/src/TesApi.Web/IBatchPool.cs b/src/TesApi.Web/IBatchPool.cs index 936abb90e..92e5db5e9 100644 --- a/src/TesApi.Web/IBatchPool.cs +++ b/src/TesApi.Web/IBatchPool.cs @@ -44,7 +44,7 @@ public interface IBatchPool /// Indicates that the pool is not scheduled to run tasks nor running tasks. /// /// - ValueTask CanBeDeleted(CancellationToken cancellationToken = default); + ValueTask CanBeDeletedAsync(CancellationToken cancellationToken = default); /// /// Updates this instance based on changes to its environment. @@ -58,21 +58,21 @@ public interface IBatchPool /// /// /// - IAsyncEnumerable GetTaskResizeFailures(CancellationToken cancellationToken = default); + IAsyncEnumerable GetTaskResizeFailuresAsync(CancellationToken cancellationToken = default); /// /// Gets the last time the pool's compute node list was changed. /// /// /// - ValueTask GetAllocationStateTransitionTime(CancellationToken cancellationToken = default); + ValueTask GetAllocationStateTransitionTimeAsync(CancellationToken cancellationToken = default); /// /// Gets the completed tasks in this pool's associated job. /// /// /// - IAsyncEnumerable GetCompletedTasks(CancellationToken cancellationToken); + IAsyncEnumerable GetCompletedTasksAsync(CancellationToken cancellationToken); /// /// A not yet assigned a compute nodes to remove due to a nonrecoverable compute node or pool resize error. 
From 774a2e398168179b4f05c454b9f8810eaa707b0f Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 2 Nov 2023 23:55:19 -0700 Subject: [PATCH 071/202] Retry logging and reduction of errors when pools are removed --- .../CacheAndRetryHandlerTest.cs | 4 +- .../TerraLandingZoneApiClientTest.cs | 17 +- .../TerraWsmApiClientTests.cs | 9 + src/Tes.ApiClients/HttpApiClient.cs | 77 ++++++-- src/Tes.ApiClients/RetryHandler.cs | 55 +++--- src/Tes.ApiClients/TerraWsmApiClient.cs | 5 +- src/TesApi.Tests/BatchPoolTests.cs | 2 - src/TesApi.Web/AzureProxy.cs | 4 - src/TesApi.Web/BatchPool.cs | 21 +- src/TesApi.Web/BatchPools.BatchScheduler.cs | 13 +- .../CachingWithRetriesAzureProxy.cs | 180 +++++++++++------- .../Extensions/PagedInterfaceExtensions.cs | 5 +- src/TesApi.Web/IAzureProxy.cs | 16 -- src/TesApi.Web/IBatchPool.cs | 5 + .../Management/ContainerRegistryProvider.cs | 4 +- 15 files changed, 268 insertions(+), 149 deletions(-) diff --git a/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs b/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs index 19c8e0ac4..efb641858 100644 --- a/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs +++ b/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs @@ -92,7 +92,7 @@ public async Task ExecuteHttpRequestWithRetryAsync_RetriesOnlyOnExpectedFailureC mockFactory.Setup(f => f.CreateResponseAsync()).Returns(CreateResponseAsync(statusCode)); var response = - await cachingRetryHandler.ExecuteHttpRequestWithRetryAsync(_ => + await cachingRetryHandler.ExecuteWithRetryAsync(_ => mockFactory.Object.CreateResponseAsync(), System.Threading.CancellationToken.None); @@ -150,7 +150,7 @@ await cachingRetryHandler.ExecuteHttpRequestWithRetryAsync(_ => // Assert.IsFalse(appCache.TryGetValue(cacheKey, out HttpResponseMessage _)); // } - private Task CreateResponseAsync(HttpStatusCode statusCode) + private static Task CreateResponseAsync(HttpStatusCode statusCode) => Task.FromResult(new(statusCode)); public interface 
ITestHttpResponseMessageFactory diff --git a/src/Tes.ApiClients.Tests/TerraLandingZoneApiClientTest.cs b/src/Tes.ApiClients.Tests/TerraLandingZoneApiClientTest.cs index e07ca72c0..7e975ec6d 100644 --- a/src/Tes.ApiClients.Tests/TerraLandingZoneApiClientTest.cs +++ b/src/Tes.ApiClients.Tests/TerraLandingZoneApiClientTest.cs @@ -21,6 +21,9 @@ public void SetUp() terraApiStubData = new TerraApiStubData(); tokenCredential = new Mock(); cacheAndRetryHandler = new Mock(); + var cache = new Mock(); + cache.Setup(c => c.CreateEntry(It.IsAny())).Returns(new Mock().Object); + cacheAndRetryHandler.SetupGet(c => c.AppCache).Returns(cache.Object); terraLandingZoneApiClient = new TerraLandingZoneApiClient(TerraApiStubData.LandingZoneApiHost, tokenCredential.Object, cacheAndRetryHandler.Object, NullLogger.Instance); } @@ -28,7 +31,12 @@ public void SetUp() public async Task GetResourceQuotaAsync_ValidResourceIdReturnsQuotaInformationAndGetsAuthToken() { var body = terraApiStubData.GetResourceQuotaApiResponseInJson(); - cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAndCachingAsync(It.IsAny(), + + cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync( + It.IsAny>>(), It.IsAny(), It.IsAny())) + .ReturnsAsync(new HttpResponseMessage()); + + cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync( It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(body); @@ -55,7 +63,11 @@ public async Task GetLandingZoneResourcesAsync_ListOfLandingZoneResourcesAndGets { var body = terraApiStubData.GetResourceApiResponseInJson(); - cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAndCachingAsync(It.IsAny(), + cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync( + It.IsAny>>(), It.IsAny(), It.IsAny())) + .ReturnsAsync(new HttpResponseMessage()); + + cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync( It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(body); @@ -67,7 +79,6 @@ public async Task GetLandingZoneResourcesAsync_ListOfLandingZoneResourcesAndGets tokenCredential.Verify(t => 
t.GetTokenAsync(It.IsAny(), It.IsAny()), Times.Once); - } [TestMethod] diff --git a/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs b/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs index 791d45c3c..3e6bf460e 100644 --- a/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs +++ b/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs @@ -25,6 +25,9 @@ public void SetUp() terraApiStubData = new TerraApiStubData(); tokenCredential = new Mock(); cacheAndRetryHandler = new Mock(); + var cache = new Mock(); + cache.Setup(c => c.CreateEntry(It.IsAny())).Returns(new Mock().Object); + cacheAndRetryHandler.SetupGet(c => c.AppCache).Returns(cache.Object); terraWsmApiClient = new TerraWsmApiClient(TerraApiStubData.WsmApiHost, tokenCredential.Object, cacheAndRetryHandler.Object, NullLogger.Instance); } @@ -92,6 +95,9 @@ public async Task GetSasTokenAsync_ValidRequest_ReturnsPayload() cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(response); + cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) + .Returns((Func> action, CancellationToken cancellationToken, Polly.Context _2) => action(cancellationToken)); + var apiResponse = await terraWsmApiClient.GetSasTokenAsync(terraApiStubData.WorkspaceId, terraApiStubData.ContainerResourceId, null!, CancellationToken.None); @@ -111,6 +117,9 @@ public async Task GetContainerResourcesAsync_ValidRequest_ReturnsPayload() cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(response); + cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) + .Returns((Func> action, CancellationToken cancellationToken, Polly.Context _2) => action(cancellationToken)); + var apiResponse = await terraWsmApiClient.GetContainerResourcesAsync(terraApiStubData.WorkspaceId, offset: 0, limit: 10, CancellationToken.None); diff --git a/src/Tes.ApiClients/HttpApiClient.cs 
b/src/Tes.ApiClients/HttpApiClient.cs index 26e482ba0..9a7724eda 100644 --- a/src/Tes.ApiClients/HttpApiClient.cs +++ b/src/Tes.ApiClients/HttpApiClient.cs @@ -6,6 +6,7 @@ using System.Text; using System.Text.Json; using Azure.Core; +using Microsoft.Extensions.Caching.Memory; using Microsoft.Extensions.Logging; namespace Tes.ApiClients @@ -68,6 +69,25 @@ protected HttpApiClient(TokenCredential tokenCredential, string tokenScope, /// protected HttpApiClient() { } + private Action LogRetryErrorOnRetryHandler([System.Runtime.CompilerServices.CallerMemberName] string? caller = default) + => new((exception, retryCount, timeSpan) => + { + Logger?.LogError(exception, @"Retrying {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan}", caller, retryCount, timeSpan); + }); + + private Action, TimeSpan, int> LogRetryErrorOnRetryHandler([System.Runtime.CompilerServices.CallerMemberName] string? caller = default) + => new((result, retryCount, timeSpan) => + { + if (result.Exception is null) + { + Logger?.LogError(@"Retrying {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan}", caller, retryCount, timeSpan); + } + else + { + Logger?.LogError(result.Exception, @"Retrying {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan}", caller, retryCount, timeSpan); + } + }); + /// /// Sends request with a retry policy /// @@ -78,7 +98,10 @@ protected HttpApiClient() { } /// protected async Task HttpSendRequestWithRetryPolicyAsync( Func httpRequestFactory, CancellationToken cancellationToken, bool setAuthorizationHeader = false) - => await cachingRetryHandler.ExecuteWithRetryAsync(async ct => + { + var ctx = new Polly.Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return await cachingRetryHandler.ExecuteWithRetryAsync(async ct => { var request = httpRequestFactory(); if (setAuthorizationHeader) @@ -87,7 +110,8 @@ protected async Task HttpSendRequestWithRetryPolicyAsync( } return await HttpClient.SendAsync(request, ct); - }, cancellationToken); + }, 
cancellationToken, ctx); + } /// /// Sends a Http Get request to the URL and returns body response as string @@ -136,12 +160,20 @@ protected async Task HttpGetRequestWithCachingAndRetryPolicyAsync(Uri re { var cacheKey = await ToCacheKeyAsync(requestUrl, setAuthorizationHeader, cancellationToken); - return await cachingRetryHandler.ExecuteWithRetryAndCachingAsync(cacheKey, async ct => + return (await cachingRetryHandler.AppCache.GetOrCreateAsync(cacheKey, async _ => { - var httpRequest = await CreateGetHttpRequest(requestUrl, setAuthorizationHeader, ct); + var ctx = new Polly.Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + var response = await cachingRetryHandler.ExecuteWithRetryAsync(async ct => + { + var httpRequest = await CreateGetHttpRequest(requestUrl, setAuthorizationHeader, ct); + + var httpResponse = await HttpClient.SendAsync(httpRequest, ct); + return httpResponse.EnsureSuccessStatusCode(); + }, cancellationToken, ctx); - return await ExecuteRequestAndReadResponseBodyAsync(httpRequest, ct); - }, cancellationToken); + return await ReadResponseBodyAsync(response, cancellationToken); + }))!; } /// @@ -153,13 +185,20 @@ protected async Task HttpGetRequestWithCachingAndRetryPolicyAsync(Uri re /// protected async Task HttpGetRequestWithRetryPolicyAsync(Uri requestUrl, CancellationToken cancellationToken, bool setAuthorizationHeader = false) - => await cachingRetryHandler.ExecuteWithRetryAsync(async ct => + { + var ctx = new Polly.Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + var response = await cachingRetryHandler.ExecuteWithRetryAsync(async ct => { //request must be recreated in every retry. 
var httpRequest = await CreateGetHttpRequest(requestUrl, setAuthorizationHeader, ct); - return await ExecuteRequestAndReadResponseBodyAsync(httpRequest, ct); - }, cancellationToken); + var httpResponse = await HttpClient.SendAsync(httpRequest, ct); + return httpResponse.EnsureSuccessStatusCode(); + }, cancellationToken, ctx); + + return await ReadResponseBodyAsync(response, cancellationToken); + } /// /// Returns an query string key-value, with the value escaped. If the value is null or empty returns an empty string @@ -167,7 +206,7 @@ protected async Task HttpGetRequestWithRetryPolicyAsync(Uri requestUrl, /// parameter name /// parameter value /// - protected string ParseQueryStringParameter(string name, string value) + protected static string ParseQueryStringParameter(string name, string value) { ArgumentException.ThrowIfNullOrEmpty(name); @@ -184,7 +223,7 @@ protected string ParseQueryStringParameter(string name, string value) /// /// /// - protected string AppendQueryStringParams(params string[] arguments) + protected static string AppendQueryStringParams(params string[] arguments) { if (arguments.Length == 0) { @@ -218,13 +257,11 @@ private async Task CreateGetHttpRequest(Uri requestUrl, bool return httpRequest; } - private static async Task ExecuteRequestAndReadResponseBodyAsync(HttpRequestMessage request, CancellationToken cancellationToken) + protected async Task ReadResponseBodyAsync(HttpResponseMessage response, CancellationToken cancellationToken) { - var response = await HttpClient.SendAsync(request, cancellationToken); - - response.EnsureSuccessStatusCode(); - - return await response.Content.ReadAsStringAsync(cancellationToken); + var ctx = new Polly.Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return await cachingRetryHandler.ExecuteWithRetryAsync(response.Content.ReadAsStringAsync, cancellationToken, ctx); } private async Task AddAuthorizationHeaderToRequestAsync(HttpRequestMessage requestMessage, CancellationToken 
cancellationToken) @@ -303,7 +340,7 @@ private string ToHash(string input) { var hash = sha256.ComputeHash(Encoding.ASCII.GetBytes(input)); - return hash.Aggregate("", (current, t) => current + t.ToString("X2")); + return hash.Aggregate(string.Empty, (current, t) => current + t.ToString("X2")); } /// @@ -313,11 +350,11 @@ private string ToHash(string input) /// A for controlling the lifetime of the asynchronous operation. /// Response's content deserialization type /// - protected static async Task GetApiResponseContentAsync(HttpResponseMessage response, CancellationToken cancellationToken) + protected async Task GetApiResponseContentAsync(HttpResponseMessage response, CancellationToken cancellationToken) { response.EnsureSuccessStatusCode(); - return JsonSerializer.Deserialize(await response.Content.ReadAsStringAsync(cancellationToken))!; + return JsonSerializer.Deserialize(await ReadResponseBodyAsync(response, cancellationToken))!; } } } diff --git a/src/Tes.ApiClients/RetryHandler.cs b/src/Tes.ApiClients/RetryHandler.cs index 2df8d5acb..47440e35a 100644 --- a/src/Tes.ApiClients/RetryHandler.cs +++ b/src/Tes.ApiClients/RetryHandler.cs @@ -89,18 +89,18 @@ protected RetryHandler() { } /// public virtual AsyncRetryPolicy AsyncRetryPolicy => asyncRetryPolicy; - /// - /// Executes a delegate with the specified policy. - /// - /// Action to execute - /// - /// Result instance - public void ExecuteWithRetry(Action action, Context? context = default) - { - ArgumentNullException.ThrowIfNull(action); - - retryPolicy.Execute(_ => action(), context ?? new()); - } + ///// + ///// Executes a delegate with the specified policy. + ///// + ///// Action to execute + ///// + ///// Result instance + //public void ExecuteWithRetry(Action action, Context? context = default) + //{ + // ArgumentNullException.ThrowIfNull(action); + + // retryPolicy.Execute(_ => action(), context ?? new()); + //} /// /// Executes a delegate with the specified policy. 
@@ -116,19 +116,19 @@ public TResult ExecuteWithRetry(Func action, Context? context return retryPolicy.Execute(_ => action(), context ?? new()); } - /// - /// Executes a delegate with the specified async policy. - /// - /// Action to execute - /// - /// Result type - /// Result instance - public virtual Task ExecuteWithRetryAsync(Func> action, Context? context = default) - { - ArgumentNullException.ThrowIfNull(action); + ///// + ///// Executes a delegate with the specified async policy. + ///// + ///// Action to execute + ///// + ///// Result type + ///// Result instance + //public virtual Task ExecuteWithRetryAsync(Func> action, Context? context = default) + //{ + // ArgumentNullException.ThrowIfNull(action); - return asyncRetryPolicy.ExecuteAsync(_ => action(), context ?? new()); - } + // return asyncRetryPolicy.ExecuteAsync(_ => action(), context ?? new()); + //} /// /// Executes a delegate with the specified async policy. @@ -166,7 +166,7 @@ public async Task ExecuteWithRetryAsync(Func action, Ca /// A for controlling the lifetime of the asynchronous operation. /// /// Result HttpResponse - public virtual async Task ExecuteHttpRequestWithRetryAsync(Func> action, CancellationToken cancellationToken, Context? context = default) + public virtual async Task ExecuteWithRetryAsync(Func> action, CancellationToken cancellationToken, Context? context = default) { ArgumentNullException.ThrowIfNull(action); @@ -176,10 +176,11 @@ public virtual async Task ExecuteHttpRequestWithRetryAsync( public static class RetryHandlerExtensions { - public static void SetOnRetryHandler(this Context context, Action, TimeSpan, int> onRetry) + public static void SetOnRetryHandler(this Context context, Action, TimeSpan, int> onretry) { - context[RetryHandler.OnRetryHandlerKey] = onRetry; + context[RetryHandler.OnRetryHandlerKey] = onretry; } + public static Action, TimeSpan, int>? 
GetOnRetryHandler(this Context context) { return context.TryGetValue(RetryHandler.OnRetryHandlerKey, out var handler) ? (Action, TimeSpan, int>)handler : default; diff --git a/src/Tes.ApiClients/TerraWsmApiClient.cs b/src/Tes.ApiClients/TerraWsmApiClient.cs index 966f25921..c22308491 100644 --- a/src/Tes.ApiClients/TerraWsmApiClient.cs +++ b/src/Tes.ApiClients/TerraWsmApiClient.cs @@ -181,10 +181,11 @@ public string GetDeleteBatchPoolUrl(Guid workspaceId, Guid wsmBatchPoolResourceI private async Task LogResponseContentAsync(HttpResponseMessage response, string errMessage, Exception ex, CancellationToken cancellationToken) { - var responseContent = ""; + var responseContent = string.Empty; + if (response is not null) { - responseContent = await response.Content.ReadAsStringAsync(cancellationToken); + responseContent = await ReadResponseBodyAsync(response, cancellationToken); } Logger.LogError(ex, $"{errMessage}. Response content:{responseContent}"); diff --git a/src/TesApi.Tests/BatchPoolTests.cs b/src/TesApi.Tests/BatchPoolTests.cs index ad114d6a7..bd3b91142 100644 --- a/src/TesApi.Tests/BatchPoolTests.cs +++ b/src/TesApi.Tests/BatchPoolTests.cs @@ -185,7 +185,6 @@ internal static AzureProxyReturnValues Get() internal Func AzureProxyGetComputeNodeAllocationState { get; set; } = null; internal Action AzureProxyDeleteBatchPool { get; set; } = (poolId, cancellationToken) => { }; internal Func> AzureProxyListTasks { get; set; } = (jobId, detailLevel) => AsyncEnumerable.Empty(); - internal Func EvaluateAutoScale { get; set; } //= new((poolId, autoscaleFormula) => AutoScaleRun); internal List VmSizesAndPrices { get; set; } = new(); internal static Func AzureProxyGetComputeNodeAllocationStateDefault = id => new(Microsoft.Azure.Batch.Common.AllocationState.Steady, DateTime.MinValue.ToUniversalTime(), true, 0, 0, 0, 0); @@ -381,7 +380,6 @@ private static Action> PrepareMockAzureProxy(AzureProxyReturnV azureProxy.Setup(a => a.GetBatchPoolAsync(It.IsAny(), It.IsAny(), 
It.IsAny())).Returns((string id, System.Threading.CancellationToken cancellationToken, DetailLevel detailLevel) => Task.FromResult(azureProxyReturnValues.GetBatchPoolImpl(id))); azureProxy.Setup(a => a.GetFullAllocationStateAsync(It.IsAny(), It.IsAny())).Returns((string poolId, System.Threading.CancellationToken _1) => Task.FromResult(GetPoolStateFromSettingStateOrDefault(poolId))); azureProxy.Setup(a => a.DeleteBatchPoolAsync(It.IsAny(), It.IsAny())).Callback((poolId, cancellationToken) => azureProxyReturnValues.AzureProxyDeleteBatchPoolImpl(poolId, cancellationToken)).Returns(Task.CompletedTask); - azureProxy.Setup(a => a.EvaluateAutoScaleAsync(It.IsAny(), It.IsAny(), It.IsAny())).Returns((string poolId, string autoscaleFormula, CancellationToken _1) => Task.FromResult(azureProxyReturnValues.EvaluateAutoScale(poolId, autoscaleFormula))); FullBatchPoolAllocationState GetPoolStateFromSettingStateOrDefault(string poolId) { diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index 58c838c98..9063e1b76 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -549,9 +549,5 @@ public async Task EnableBatchPoolAutoScaleAsync(string poolId, bool preemptable, logger.LogDebug("Setting Pool {PoolID} to AutoScale({AutoScaleInterval}): '{AutoScaleFormula}'", poolId, interval, formula.Replace(Environment.NewLine, @"\n")); await batchClient.PoolOperations.EnableAutoScaleAsync(poolId, formula, interval, cancellationToken: cancellationToken); } - - /// - public Task EvaluateAutoScaleAsync(string poolId, string autoscaleFormula, CancellationToken cancellationToken) - => batchClient.PoolOperations.EvaluateAutoScaleAsync(poolId, autoscaleFormula, cancellationToken: cancellationToken); } } diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index 80335740a..d0146a9f5 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -61,7 +61,7 @@ public BatchPool(IBatchScheduler batchScheduler, IOptions 
ResizeErrors { get; } = new(); private IAsyncEnumerable GetTasksAsync(string select, string filter) - => _azureProxy.ListTasksAsync(Id, new ODATADetailLevel { SelectClause = select, FilterClause = filter }); + => _removedFromService ? AsyncEnumerable.Empty() : _azureProxy.ListTasksAsync(Id, new ODATADetailLevel { SelectClause = select, FilterClause = filter }); internal IAsyncEnumerable GetTasksAsync(bool includeCompleted) => GetTasksAsync("id,stateTransitionTime", includeCompleted ? default : "state ne 'completed'"); @@ -491,6 +491,7 @@ private async ValueTask ServicePoolRemovePoolIfEmptyAsync(CancellationToken canc public sealed partial class BatchPool : IBatchPool { private static readonly SemaphoreSlim lockObj = new(1, 1); + private bool _removedFromService = false; /// /// Types of maintenance calls offered by the service method. @@ -527,6 +528,11 @@ internal enum ServiceKind /// public async ValueTask CanBeDeletedAsync(CancellationToken cancellationToken = default) { + if (_removedFromService) + { + return true; + } + if (await GetTasksAsync(includeCompleted: true).AnyAsync(cancellationToken)) { return false; @@ -569,7 +575,10 @@ internal async ValueTask ServicePoolAsync(ServiceKind serviceKind, CancellationT try // Don't add any code that can throw between this line and the call above to acquire lockObj. { - await func(cancellationToken); + if (!_removedFromService) + { + await func(cancellationToken); + } } finally { @@ -706,7 +715,7 @@ public IAsyncEnumerable GetCompletedTasksAsync(CancellationToken _1) /// public async ValueTask GetAllocationStateTransitionTimeAsync(CancellationToken cancellationToken = default) - => (await _azureProxy.GetBatchPoolAsync(Id, cancellationToken, new ODATADetailLevel { SelectClause = "allocationStateTransitionTime" })).AllocationStateTransitionTime ?? DateTime.UtcNow; + => (await _azureProxy.GetFullAllocationStateAsync(Id, cancellationToken)).AllocationStateTransitionTime ?? 
DateTime.UtcNow; /// public async ValueTask CreatePoolAndJobAsync(Microsoft.Azure.Management.Batch.Models.Pool poolModel, bool isPreemptible, CancellationToken cancellationToken) @@ -797,6 +806,12 @@ private void Configure(CloudPool pool, bool forceRemove) IsDedicated = bool.Parse(pool.Metadata.First(m => BatchScheduler.PoolIsDedicated.Equals(m.Name, StringComparison.Ordinal)).Value); _ = _batchPools.AddPool(this); } + + /// + public void MarkRemovedFromService() + { + _removedFromService = true; + } } /// diff --git a/src/TesApi.Web/BatchPools.BatchScheduler.cs b/src/TesApi.Web/BatchPools.BatchScheduler.cs index a16de041f..8f09c05ec 100644 --- a/src/TesApi.Web/BatchPools.BatchScheduler.cs +++ b/src/TesApi.Web/BatchPools.BatchScheduler.cs @@ -176,7 +176,18 @@ public IEnumerable GetPools() /// public bool RemovePoolFromList(IBatchPool pool) - => batchPools.Remove(pool); + { + pool.MarkRemovedFromService(); + + try + { + return batchPools.Remove(pool); + } + catch (InvalidOperationException) + { + return true; + } + } /// public async ValueTask FlushPoolsAsync(IEnumerable assignedPools, CancellationToken cancellationToken) diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index 5243ec0c7..44ae70368 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -4,7 +4,6 @@ using System; using System.Collections.Generic; using System.Linq; -//using System.Runtime.CompilerServices; using System.Runtime.ExceptionServices; using System.Threading; using System.Threading.Tasks; @@ -12,11 +11,8 @@ using Microsoft.Azure.Batch.Common; using Microsoft.Extensions.Caching.Memory; using Microsoft.Extensions.Logging; -using Microsoft.Extensions.Options; using Polly; -using Polly.Retry; using Tes.ApiClients; -using Tes.ApiClients.Options; using TesApi.Web.Extensions; using TesApi.Web.Storage; using BatchModels = Microsoft.Azure.Management.Batch.Models; @@ -31,16 +27,14 @@ 
public class CachingWithRetriesAzureProxy : IAzureProxy private readonly ILogger logger; private readonly IAzureProxy azureProxy; private readonly CachingRetryHandler cachingRetryHandler; - private readonly AsyncRetryPolicy batchPoolOrJobCreateOrTaskAddHandler; /// /// Contructor to create a cache of /// /// - /// /// /// - public CachingWithRetriesAzureProxy(IAzureProxy azureProxy, IOptions retryPolicyOptions, CachingRetryHandler cachingRetryHandler, ILogger logger) + public CachingWithRetriesAzureProxy(IAzureProxy azureProxy, CachingRetryHandler cachingRetryHandler, ILogger logger) { ArgumentNullException.ThrowIfNull(azureProxy); ArgumentNullException.ThrowIfNull(cachingRetryHandler); @@ -48,26 +42,61 @@ public CachingWithRetriesAzureProxy(IAzureProxy azureProxy, IOptions + /// Rethrows exception if exception is and the Batch API call returned otherwise invokes . + /// + /// Polly retry handler. + /// Polly retry handler. + private static Action OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenExists(Action OnRetry) + => new((outcome, timespan, retryCount) => { - BatchErrorCodeStrings.TaskExists, - BatchErrorCodeStrings.PoolExists, - BatchErrorCodeStrings.JobExists - }; - - batchPoolOrJobCreateOrTaskAddHandler = Policy - .Handle() - .WaitAndRetryAsync(retryPolicyOptions.Value.MaxRetryCount, - (attempt) => TimeSpan.FromSeconds(Math.Pow(retryPolicyOptions.Value.ExponentialBackOffExponent, attempt)), - (exception, timeSpan) => - { - if (exception is BatchException batchException && creationErrorFoundCodes.Contains(batchException.RequestInformation?.BatchError?.Code, StringComparer.OrdinalIgnoreCase)) - { - ExceptionDispatchInfo.Capture(exception).Throw(); - } - }); - } + if (outcome is BatchException batchException && CreationErrorFoundCodes.Contains(batchException.RequestInformation?.BatchError?.Code, StringComparer.OrdinalIgnoreCase)) + { + ExceptionDispatchInfo.Capture(outcome).Throw(); + } + + OnRetry?.Invoke(outcome, timespan, retryCount); + }); + + /// 
+ /// Rethrows exception if exception is and the Batch API call returned otherwise invokes . + /// + /// Polly retry handler. + /// Polly retry handler. + private static Action OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound(Action OnRetry) + => new((outcome, timespan, retryCount) => + { + if (outcome is BatchException batchException && batchException.RequestInformation.HttpStatusCode == System.Net.HttpStatusCode.NotFound) + { + ExceptionDispatchInfo.Capture(outcome).Throw(); + } + + OnRetry?.Invoke(outcome, timespan, retryCount); + }); + + private Action LogRetryErrorOnRetryHandler([System.Runtime.CompilerServices.CallerMemberName] string caller = default) + => new((exception, retryCount, timeSpan) => + { + logger?.LogError(exception, @"Retrying {Method}: RetryCount: {RetryCount} RetryCount: {TimeSpan}", caller, retryCount, timeSpan); + }); + + //private Action, TimeSpan, int> LogRetryErrorOnRetryHandler([System.Runtime.CompilerServices.CallerMemberName] string caller = default) + // => new((result, retryCount, timeSpan) => + // { + // if (result.Exception is not null) + // { + // logger?.LogError(result.Exception, @"Retrying {Method}: RetryCount: {RetryCount} RetryCount: {TimeSpan}", caller, retryCount, timeSpan); + // } + // }); /// @@ -75,7 +104,9 @@ public async Task CreateBatchJobAsync(string jobId, CancellationToken cancellati { try { - await batchPoolOrJobCreateOrTaskAddHandler.ExecuteAsync(ct => azureProxy.CreateBatchJobAsync(jobId, ct), cancellationToken); + var ctx = new Context(); + ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenExists(LogRetryErrorOnRetryHandler())); + await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.CreateBatchJobAsync(jobId, ct), cancellationToken, ctx); } catch (BatchException exc) when (BatchErrorCodeStrings.JobExists.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { } @@ -86,19 +117,19 @@ public async Task AddBatchTaskAsync(string 
tesTaskId, CloudTask cloudTask, strin { try { - await batchPoolOrJobCreateOrTaskAddHandler.ExecuteAsync(ct => azureProxy.AddBatchTaskAsync(tesTaskId, cloudTask, jobId, ct), cancellationToken); + var ctx = new Context(); + ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenExists(LogRetryErrorOnRetryHandler())); + await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.AddBatchTaskAsync(tesTaskId, cloudTask, jobId, ct), cancellationToken, ctx); } catch (BatchException exc) when (BatchErrorCodeStrings.TaskExists.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { } } - //private void LogRetryError(Exception exception, int retryCount, TimeSpan timeSpan, string message, [CallerMemberName] string caller = default) { } - /// public Task DeleteBatchJobAsync(string jobId, CancellationToken cancellationToken) { var ctx = new Context(); - ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying DeleteBatchJobAsync ({RetryCount}).", retryCount)); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchJobAsync(jobId, ct), cancellationToken, ctx); } @@ -106,7 +137,7 @@ public Task DeleteBatchJobAsync(string jobId, CancellationToken cancellationToke public Task DeleteBatchTaskAsync(string tesTaskId, string jobId, CancellationToken cancellationToken) { var ctx = new Context(); - ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying DeleteBatchTaskAsync ({RetryCount}).", retryCount)); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchTaskAsync(tesTaskId, jobId, ct), cancellationToken, ctx); } @@ -114,7 +145,7 @@ public Task DeleteBatchTaskAsync(string tesTaskId, string jobId, CancellationTok public Task TerminateBatchTaskAsync(string tesTaskId, string jobId, CancellationToken 
cancellationToken) { var ctx = new Context(); - ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying TerminateBatchTaskAsync ({RetryCount}).", retryCount)); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.TerminateBatchTaskAsync(tesTaskId, jobId, ct), cancellationToken, ctx); } @@ -122,7 +153,7 @@ public Task TerminateBatchTaskAsync(string tesTaskId, string jobId, Cancellation public Task DeleteBatchPoolAsync(string poolId, CancellationToken cancellationToken) { var ctx = new Context(); - ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying DeleteBatchPoolAsync ({RetryCount}).", retryCount)); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchPoolAsync(poolId, ct), cancellationToken, ctx); } @@ -130,7 +161,7 @@ public Task DeleteBatchPoolAsync(string poolId, CancellationToken cancellationTo public Task GetBatchPoolAsync(string poolId, CancellationToken cancellationToken, DetailLevel detailLevel) { var ctx = new Context(); - ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying GetBatchPoolAsync ({RetryCount}).", retryCount)); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBatchPoolAsync(poolId, ct, detailLevel), cancellationToken, ctx); } @@ -138,7 +169,7 @@ public Task GetBatchPoolAsync(string poolId, CancellationToken cancel public Task GetBatchJobAsync(string jobId, CancellationToken cancellationToken, DetailLevel detailLevel) { var ctx = new Context(); - ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying GetBatchJobAsync ({RetryCount}).", retryCount)); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); return cachingRetryHandler.ExecuteWithRetryAsync(ct => 
azureProxy.GetBatchJobAsync(jobId, ct, detailLevel), cancellationToken, ctx); } @@ -147,7 +178,7 @@ public async Task DeleteBatchComputeNodesAsync(string poolId, IEnumerable logger.LogError(outcome, "Retrying DeleteBatchComputeNodesAsync ({RetryCount}).", retryCount)); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchComputeNodesAsync(poolId, computeNodes, ct), cancellationToken, ctx); } @@ -155,7 +186,7 @@ public async Task DeleteBatchComputeNodesAsync(string poolId, IEnumerable DownloadBlobAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) { var ctx = new Context(); - ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying DownloadBlobAsync ({RetryCount}).", retryCount)); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DownloadBlobAsync(blobAbsoluteUri, ct), cancellationToken, ctx); } @@ -163,7 +194,7 @@ public Task DownloadBlobAsync(Uri blobAbsoluteUri, CancellationToken can public Task BlobExistsAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) { var ctx = new Context(); - ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying BlobExistsAsync ({RetryCount}).", retryCount)); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.BlobExistsAsync(blobAbsoluteUri, ct), cancellationToken, ctx); } @@ -171,7 +202,7 @@ public Task BlobExistsAsync(Uri blobAbsoluteUri, CancellationToken cancell public IAsyncEnumerable GetActivePoolsAsync(string hostName) { var ctx = new Context(); - ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying GetActivePoolsAsync ({RetryCount}).", retryCount)); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); return cachingRetryHandler.ExecuteWithRetry(() => 
azureProxy.GetActivePoolsAsync(hostName), ctx); } @@ -180,7 +211,7 @@ public IAsyncEnumerable GetActivePoolsAsync(string hostName) public int GetBatchActiveJobCount() { var ctx = new Context(); - ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying GetBatchActiveJobCount ({RetryCount}).", retryCount)); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); return cachingRetryHandler.ExecuteWithRetry(azureProxy.GetBatchActiveJobCount, ctx); } @@ -188,7 +219,7 @@ public int GetBatchActiveJobCount() public IEnumerable GetBatchActiveNodeCountByVmSize() { var ctx = new Context(); - ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying GetBatchActiveNodeCountByVmSize ({RetryCount}).", retryCount)); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); return cachingRetryHandler.ExecuteWithRetry(azureProxy.GetBatchActiveNodeCountByVmSize, ctx); } @@ -196,7 +227,7 @@ public IEnumerable GetBatchActiveNodeCountByVmSize() public int GetBatchActivePoolCount() { var ctx = new Context(); - ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying GetBatchActivePoolCount ({RetryCount}).", retryCount)); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); return cachingRetryHandler.ExecuteWithRetry(azureProxy.GetBatchActivePoolCount, ctx); } @@ -204,7 +235,7 @@ public int GetBatchActivePoolCount() public Task GetStorageAccountKeyAsync(StorageAccountInfo storageAccountInfo, CancellationToken cancellationToken) { var ctx = new Context(); - ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying GetStorageAccountKeyAsync ({RetryCount}).", retryCount)); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); return cachingRetryHandler.ExecuteWithRetryAndCachingAsync($"{nameof(CachingWithRetriesAzureProxy)}:{storageAccountInfo.Id}", ct => azureProxy.GetStorageAccountKeyAsync(storageAccountInfo, ct), DateTimeOffset.Now.AddHours(1), 
cancellationToken, ctx); } @@ -217,7 +248,7 @@ public async Task GetStorageAccountInfoAsync(string storageA if (storageAccountInfo is null) { var ctx = new Context(); - ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying GetStorageAccountInfoAsync ({RetryCount}).", retryCount)); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); storageAccountInfo = await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetStorageAccountInfoAsync(storageAccountName, ct), cancellationToken, ctx); if (storageAccountInfo is not null) @@ -233,7 +264,7 @@ public async Task GetStorageAccountInfoAsync(string storageA public IAsyncEnumerable ListBlobsAsync(Uri directoryUri, CancellationToken cancellationToken) { var ctx = new Context(); - ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying ListBlobsAsync ({RetryCount}).", retryCount)); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); return cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListBlobsAsync(directoryUri, cancellationToken), cachingRetryHandler.RetryPolicy, ctx); } @@ -241,7 +272,7 @@ public IAsyncEnumerable ListBlobsAsync(Uri directoryUri, Cancell public Task UploadBlobAsync(Uri blobAbsoluteUri, string content, CancellationToken cancellationToken) { var ctx = new Context(); - ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying UploadBlobAsync ({RetryCount}).", retryCount)); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.UploadBlobAsync(blobAbsoluteUri, content, ct), cancellationToken); } @@ -249,7 +280,7 @@ public Task UploadBlobAsync(Uri blobAbsoluteUri, string content, CancellationTok public Task UploadBlobFromFileAsync(Uri blobAbsoluteUri, string filePath, CancellationToken cancellationToken) { var ctx = new Context(); - ctx.SetOnRetryHandler((outcome, timespan, retryCount) => 
logger.LogError(outcome, "Retrying UploadBlobFromFileAsync ({RetryCount}).", retryCount)); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.UploadBlobFromFileAsync(blobAbsoluteUri, filePath, ct), cancellationToken, ctx); } @@ -257,7 +288,7 @@ public Task UploadBlobFromFileAsync(Uri blobAbsoluteUri, string filePath, Cancel public Task GetBlobPropertiesAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) { var ctx = new Context(); - ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying GetBlobPropertiesAsync ({RetryCount}).", retryCount)); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBlobPropertiesAsync(blobAbsoluteUri, ct), cancellationToken, ctx); } @@ -275,7 +306,9 @@ public async Task CreateBatchPoolAsync(BatchModels.Pool poolInfo, boo { try { - return await batchPoolOrJobCreateOrTaskAddHandler.ExecuteAsync(ct => azureProxy.CreateBatchPoolAsync(poolInfo, isPreemptable, ct), cancellationToken); + var ctx = new Context(); + ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenExists(LogRetryErrorOnRetryHandler())); + return await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.CreateBatchPoolAsync(poolInfo, isPreemptable, ct), cancellationToken, ctx); } catch (BatchException exc) when (BatchErrorCodeStrings.PoolExists.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { @@ -287,11 +320,11 @@ public async Task CreateBatchPoolAsync(BatchModels.Pool poolInfo, boo public Task GetFullAllocationStateAsync(string poolId, CancellationToken cancellationToken) { var ctx = new Context(); - ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying GetFullAllocationStateAsync ({RetryCount}).", retryCount)); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); return 
cachingRetryHandler.ExecuteWithRetryAndCachingAsync( $"{nameof(CachingWithRetriesAzureProxy)}:{poolId}", ct => azureProxy.GetFullAllocationStateAsync(poolId, ct), - DateTimeOffset.Now.Add(BatchPoolService.RunInterval).Subtract(TimeSpan.FromSeconds(1)), + DateTimeOffset.UtcNow.Add(BatchPoolService.RunInterval.Divide(2)), cancellationToken, ctx); } @@ -299,39 +332,54 @@ public Task GetFullAllocationStateAsync(string poo /// public IAsyncEnumerable ListComputeNodesAsync(string poolId, DetailLevel detailLevel) { - var ctx = new Context(); - ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying ListComputeNodesAsync ({RetryCount}).", retryCount)); - return cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListComputeNodesAsync(poolId, detailLevel), cachingRetryHandler.RetryPolicy, ctx); + try + { + var ctx = new Context(); + ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound(LogRetryErrorOnRetryHandler())); + return cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListComputeNodesAsync(poolId, detailLevel), cachingRetryHandler.RetryPolicy, ctx); + } + catch (BatchException exc) when (exc.RequestInformation.HttpStatusCode == System.Net.HttpStatusCode.NotFound) + { + return AsyncEnumerable.Empty(); + } } /// public IAsyncEnumerable ListTasksAsync(string jobId, DetailLevel detailLevel) { - var ctx = new Context(); - ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying ListTasksAsync ({RetryCount}).", retryCount)); - return cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListTasksAsync(jobId, detailLevel), cachingRetryHandler.RetryPolicy, ctx); + try + { + var ctx = new Context(); + ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound(LogRetryErrorOnRetryHandler())); + return cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListTasksAsync(jobId, detailLevel), 
cachingRetryHandler.RetryPolicy, ctx); + } + catch (BatchException exc) when (exc.RequestInformation.HttpStatusCode == System.Net.HttpStatusCode.NotFound) + { + return AsyncEnumerable.Empty(); + } } /// - public Task DisableBatchPoolAutoScaleAsync(string poolId, CancellationToken cancellationToken) => azureProxy.DisableBatchPoolAutoScaleAsync(poolId, cancellationToken); + public Task DisableBatchPoolAutoScaleAsync(string poolId, CancellationToken cancellationToken) + { + var ctx = new Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DisableBatchPoolAutoScaleAsync(poolId, ct), cancellationToken, ctx); + } /// public Task EnableBatchPoolAutoScaleAsync(string poolId, bool preemptable, TimeSpan interval, IAzureProxy.BatchPoolAutoScaleFormulaFactory formulaFactory, Func> currentTargetFunc, CancellationToken cancellationToken) - => azureProxy.EnableBatchPoolAutoScaleAsync(poolId, preemptable, interval, formulaFactory, currentTargetFunc, cancellationToken); - - /// - public Task EvaluateAutoScaleAsync(string poolId, string autoscaleFormula, CancellationToken cancellationToken) { var ctx = new Context(); - ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying EvaluateAutoScaleAsync ({RetryCount}).", retryCount)); - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.EvaluateAutoScaleAsync(poolId, autoscaleFormula, ct), cancellationToken, ctx); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.EnableBatchPoolAutoScaleAsync(poolId, preemptable, interval, formulaFactory, currentTargetFunc, ct), cancellationToken, ctx); } /// public IAsyncEnumerable ListBlobsWithTagsAsync(Uri containerUri, string prefix, CancellationToken cancellationToken) { var ctx = new Context(); - ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying 
ListBlobsWithTagsAsync ({RetryCount}).", retryCount)); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); return cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListBlobsWithTagsAsync(containerUri, prefix, cancellationToken), cachingRetryHandler.RetryPolicy, ctx); } @@ -339,7 +387,7 @@ public Task EvaluateAutoScaleAsync(string poolId, string autoscale public Task SetBlobTags(Uri blobAbsoluteUri, IDictionary tags, CancellationToken cancellationToken) { var ctx = new Context(); - ctx.SetOnRetryHandler((outcome, timespan, retryCount) => logger.LogError(outcome, "Retrying SetBlobTags ({RetryCount}).", retryCount)); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.SetBlobTags(blobAbsoluteUri, tags, ct), cancellationToken, ctx); } } diff --git a/src/TesApi.Web/Extensions/PagedInterfaceExtensions.cs b/src/TesApi.Web/Extensions/PagedInterfaceExtensions.cs index 541ed3af5..c75b30471 100644 --- a/src/TesApi.Web/Extensions/PagedInterfaceExtensions.cs +++ b/src/TesApi.Web/Extensions/PagedInterfaceExtensions.cs @@ -65,7 +65,8 @@ public static IAsyncEnumerable ExecuteAsync(this AsyncRetryPolicy asyncRet ArgumentNullException.ThrowIfNull(func); ArgumentNullException.ThrowIfNull(retryPolicy); - return new PollyAsyncEnumerable(retryPolicy.Execute(_ => func(), ctx ??= new()), asyncRetryPolicy, ctx); + ctx ??= new(); + return new PollyAsyncEnumerable(retryPolicy.Execute(_ => func(), ctx), asyncRetryPolicy, ctx); } #region Implementation classes @@ -145,7 +146,7 @@ ValueTask IAsyncDisposable.DisposeAsync() => _source.DisposeAsync(); ValueTask IAsyncEnumerator.MoveNextAsync() - => new(_retryPolicy.ExecuteAsync((ctx, ct) => _source.MoveNextAsync(ct).AsTask(), _ctx, _cancellationToken)); + => new(_retryPolicy.ExecuteAsync((_, ct) => _source.MoveNextAsync(ct).AsTask(), new(_ctx.OperationKey, _ctx), _cancellationToken)); } private sealed class PageEnumerator : EnumeratorEnumerator> diff --git 
a/src/TesApi.Web/IAzureProxy.cs b/src/TesApi.Web/IAzureProxy.cs index aa560421e..9355b0fba 100644 --- a/src/TesApi.Web/IAzureProxy.cs +++ b/src/TesApi.Web/IAzureProxy.cs @@ -271,22 +271,6 @@ public interface IAzureProxy /// Task EnableBatchPoolAutoScaleAsync(string poolId, bool preemptable, TimeSpan interval, BatchPoolAutoScaleFormulaFactory formulaFactory, Func> currentTargetFunc, CancellationToken cancellationToken); - /// - /// Gets the result of evaluating an automatic scaling formula on the specified pool. This - /// is primarily for validating an autoscale formula, as it simply returns the result - /// without applying the formula to the pool. - /// - /// The id of the pool. - /// The formula to be evaluated on the pool. - /// A for controlling the lifetime of the asynchronous operation. - /// The result of evaluating the on the specified pool. - /// - /// The formula is validated and its results calculated, but is not applied to the pool. To apply the formula to the pool, use . - /// This method does not change any state of the pool, and does not affect the or . - /// The evaluate operation runs asynchronously. - /// - Task EvaluateAutoScaleAsync(string poolId, string autoscaleFormula, CancellationToken cancellationToken); - /// /// Describes a function to generate autoscale formulas /// diff --git a/src/TesApi.Web/IBatchPool.cs b/src/TesApi.Web/IBatchPool.cs index 92e5db5e9..9bf97e43a 100644 --- a/src/TesApi.Web/IBatchPool.cs +++ b/src/TesApi.Web/IBatchPool.cs @@ -46,6 +46,11 @@ public interface IBatchPool /// ValueTask CanBeDeletedAsync(CancellationToken cancellationToken = default); + /// + /// Indicates that the pool will no longer be serviced. + /// + void MarkRemovedFromService(); + /// /// Updates this instance based on changes to its environment. 
/// diff --git a/src/TesApi.Web/Management/ContainerRegistryProvider.cs b/src/TesApi.Web/Management/ContainerRegistryProvider.cs index 4f29a8e53..a0515f1ab 100644 --- a/src/TesApi.Web/Management/ContainerRegistryProvider.cs +++ b/src/TesApi.Web/Management/ContainerRegistryProvider.cs @@ -99,7 +99,9 @@ public bool IsImagePublic(string imageName) private async Task LookUpAndAddToCacheContainerRegistryInfoAsync(string imageName, CancellationToken cancellationToken) { - var repositories = await CachingRetryHandler.ExecuteWithRetryAsync(GetAccessibleContainerRegistriesAsync, cancellationToken: cancellationToken); + var ctx = new Polly.Context(); + ctx.SetOnRetryHandler((outcome, timespan, retryCount) => Logger.LogError(outcome, @"Retrying {Method}: RetryCount: {RetryCount} RetryCount: {TimeSpan}", nameof(LookUpAndAddToCacheContainerRegistryInfoAsync), retryCount, timespan)); + var repositories = await CachingRetryHandler.ExecuteWithRetryAsync(GetAccessibleContainerRegistriesAsync, cancellationToken: cancellationToken, context: ctx); var requestedRepo = repositories?.FirstOrDefault(reg => reg.RegistryServer.Equals(imageName.Split('/').FirstOrDefault(), StringComparison.OrdinalIgnoreCase)); From e17a1c377b832414d2b37d5a02de9e4cdedc3d54 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 3 Nov 2023 21:21:40 -0700 Subject: [PATCH 072/202] finalize featires, cleanup code --- .../Commands/ProcessLauncherTests.cs | 2 +- src/Tes.Runner.Test/ExecutorTests.cs | 4 +-- .../Transfer/BlobApiHttpUtilsTests.cs | 2 +- src/Tes.Runner/CompletedUploadFile.cs | 13 +++++++++ src/Tes.Runner/Events/EventsPublisher.cs | 4 +-- src/Tes.Runner/Executor.cs | 4 +-- src/Tes.Runner/Transfer/BlobUploader.cs | 4 +-- src/Tes/Models/TesState.cs | 2 +- src/Tes/Models/TesTaskExtended.cs | 2 +- src/TesApi.Web/AzureBatchTaskState.cs | 18 +----------- .../Events/RunnerEventsProcessor.cs | 29 ++++++++++++------- 11 files changed, 44 insertions(+), 40 deletions(-) create mode 100644 
src/Tes.Runner/CompletedUploadFile.cs diff --git a/src/Tes.Runner.Test/Commands/ProcessLauncherTests.cs b/src/Tes.Runner.Test/Commands/ProcessLauncherTests.cs index 69faea19e..aaa0b84fb 100644 --- a/src/Tes.Runner.Test/Commands/ProcessLauncherTests.cs +++ b/src/Tes.Runner.Test/Commands/ProcessLauncherTests.cs @@ -12,7 +12,7 @@ namespace Tes.Runner.Test.Commands public class ProcessLauncherTests { private ProcessLauncher processLauncher = null!; - private Mock streamLogReaderMock = null!; + private Mock streamLogReaderMock; [TestInitialize] public void SetUp() diff --git a/src/Tes.Runner.Test/ExecutorTests.cs b/src/Tes.Runner.Test/ExecutorTests.cs index 8d660b486..94d243fba 100644 --- a/src/Tes.Runner.Test/ExecutorTests.cs +++ b/src/Tes.Runner.Test/ExecutorTests.cs @@ -135,7 +135,7 @@ public async Task UploadOutputsAsync_NoOutputProvided_StartSuccessEventsAreCreat var result = await executor.UploadOutputsAsync(blobPipelineOptions); Assert.AreEqual(Executor.ZeroBytesTransferred, result); eventsPublisherMock.Verify(p => p.PublishUploadStartEventAsync(It.IsAny()), Times.Once); - eventsPublisherMock.Verify(p => p.PublishUploadEndEventAsync(It.IsAny(), 0, 0, EventsPublisher.SuccessStatus, string.Empty, null), Times.Once); + eventsPublisherMock.Verify(p => p.PublishUploadEndEventAsync(It.IsAny(), 0, 0, EventsPublisher.SuccessStatus, string.Empty, It.IsAny?>()), Times.Once); } [TestMethod] @@ -144,7 +144,7 @@ public async Task UploadOutputAsync_NullOptionsThrowsError_StartFailureEventsAre await Assert.ThrowsExceptionAsync(() => executor.UploadOutputsAsync(null!)); eventsPublisherMock.Verify(p => p.PublishUploadStartEventAsync(It.IsAny()), Times.Once); - eventsPublisherMock.Verify(p => p.PublishUploadEndEventAsync(It.IsAny(), 0, 0, EventsPublisher.FailedStatus, It.Is((c) => !string.IsNullOrEmpty(c)), null), Times.Once); + eventsPublisherMock.Verify(p => p.PublishUploadEndEventAsync(It.IsAny(), 0, 0, EventsPublisher.FailedStatus, It.Is((c) => !string.IsNullOrEmpty(c)), 
It.IsAny?>()), Times.Once); } [TestMethod] diff --git a/src/Tes.Runner.Test/Transfer/BlobApiHttpUtilsTests.cs b/src/Tes.Runner.Test/Transfer/BlobApiHttpUtilsTests.cs index 268500041..08af190a6 100644 --- a/src/Tes.Runner.Test/Transfer/BlobApiHttpUtilsTests.cs +++ b/src/Tes.Runner.Test/Transfer/BlobApiHttpUtilsTests.cs @@ -246,7 +246,7 @@ public async Task CreatePutBlobRequestAsync_ValidInput_ExpectedRequestIsCreated( } else { - Assert.AreEqual(expectedContent, await request?.Content?.ReadAsStringAsync()!); + Assert.AreEqual(expectedContent, await request?.Content?.ReadAsStringAsync()); } diff --git a/src/Tes.Runner/CompletedUploadFile.cs b/src/Tes.Runner/CompletedUploadFile.cs new file mode 100644 index 000000000..671d5cde4 --- /dev/null +++ b/src/Tes.Runner/CompletedUploadFile.cs @@ -0,0 +1,13 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +namespace Tes.Runner +{ + /// + /// Upload File Log Entry + /// + /// Size of file in bytes. + /// Target URL + /// Source Path + public record struct CompletedUploadFile(long Length, Uri? BlobUrl, string FileName); +} diff --git a/src/Tes.Runner/Events/EventsPublisher.cs b/src/Tes.Runner/Events/EventsPublisher.cs index 11cb9388d..320c444e3 100644 --- a/src/Tes.Runner/Events/EventsPublisher.cs +++ b/src/Tes.Runner/Events/EventsPublisher.cs @@ -93,7 +93,7 @@ public virtual async Task PublishUploadStartEventAsync(NodeTask nodeTask) await PublishAsync(eventMessage); } - public virtual async Task PublishUploadEndEventAsync(NodeTask nodeTask, int numberOfFiles, long totalSizeInBytes, string statusMessage, string? errorMessage = default, IEnumerable<(long Length, Uri? BlobUrl, string FileName)>? completedFiles = default) + public virtual async Task PublishUploadEndEventAsync(NodeTask nodeTask, int numberOfFiles, long totalSizeInBytes, string statusMessage, string? errorMessage = default, IEnumerable? 
completedFiles = default) { var eventMessage = CreateNewEventMessage(nodeTask.Id, UploadEndEvent, statusMessage, nodeTask.WorkflowId); @@ -105,7 +105,7 @@ public virtual async Task PublishUploadEndEventAsync(NodeTask nodeTask, int numb { "errorMessage", errorMessage??string.Empty} }; - foreach (var (length, blobUrl, fileName, index) in completedFiles?.Select((item, index) => (item.Length, item.BlobUrl, item.FileName, index)) ?? Enumerable.Empty<(long, Uri?, string, int)>()) + foreach (var (length, blobUrl, fileName, index) in completedFiles?.Select((logEntry, index) => (logEntry.Length, logEntry.BlobUrl, logEntry.FileName, index)) ?? Enumerable.Empty<(long, Uri?, string, int)>()) { eventMessage.EventData.Add($"fileSize-{index}", length.ToString()); eventMessage.EventData.Add($"fileUri-{index}", blobUrl?.ToString() ?? string.Empty); diff --git a/src/Tes.Runner/Executor.cs b/src/Tes.Runner/Executor.cs index 1af4d7304..e2ff8232a 100644 --- a/src/Tes.Runner/Executor.cs +++ b/src/Tes.Runner/Executor.cs @@ -96,7 +96,7 @@ public async Task UploadOutputsAsync(BlobPipelineOptions blobPipelineOptio var bytesTransferred = ZeroBytesTransferred; var numberOfOutputs = 0; var errorMessage = string.Empty; - IEnumerable<(long Length, Uri? BlobUrl, string FileName)>? completedFiles = default; + IEnumerable? completedFiles = default; try { @@ -140,7 +140,7 @@ public async Task UploadOutputsAsync(BlobPipelineOptions blobPipelineOptio } } - private async Task<(long BytesTransferred, IEnumerable<(long length, Uri? 
blobUrl, string fileName)> CompletedFiles)> UploadOutputsAsync(BlobPipelineOptions blobPipelineOptions, List outputs) + private async Task<(long BytesTransferred, IEnumerable CompletedFiles)> UploadOutputsAsync(BlobPipelineOptions blobPipelineOptions, List outputs) { var uploader = await transferOperationFactory.CreateBlobUploaderAsync(blobPipelineOptions); diff --git a/src/Tes.Runner/Transfer/BlobUploader.cs b/src/Tes.Runner/Transfer/BlobUploader.cs index 8d06ae3e1..6804536f3 100644 --- a/src/Tes.Runner/Transfer/BlobUploader.cs +++ b/src/Tes.Runner/Transfer/BlobUploader.cs @@ -14,7 +14,7 @@ public class BlobUploader : BlobOperationPipeline { private readonly ConcurrentDictionary hashListProviders = new(); - internal readonly ConcurrentBag<(long length, Uri? blobUrl, string fileName)> CompletedFiles = new(); + internal readonly ConcurrentBag CompletedFiles = new(); public BlobUploader(BlobPipelineOptions pipelineOptions, Channel memoryBufferPool) : base(pipelineOptions, memoryBufferPool) { @@ -143,7 +143,7 @@ public override async Task OnCompletionAsync(long length, Uri? 
blobUrl, string f } finally { - CompletedFiles.Add((length, blobUrl, fileName)); + CompletedFiles.Add(new(length, blobUrl, fileName)); response?.Dispose(); } } diff --git a/src/Tes/Models/TesState.cs b/src/Tes/Models/TesState.cs index 6ac35977d..bd4695bc3 100644 --- a/src/Tes/Models/TesState.cs +++ b/src/Tes/Models/TesState.cs @@ -99,6 +99,6 @@ public enum TesState /// Enum CANCELINGEnum for CANCELING /// [EnumMember(Value = "CANCELING")] - CANCELINGEnum = 11 + CANCELINGEnum = 11, } } diff --git a/src/Tes/Models/TesTaskExtended.cs b/src/Tes/Models/TesTaskExtended.cs index 357f4e495..9d8954cf1 100644 --- a/src/Tes/Models/TesTaskExtended.cs +++ b/src/Tes/Models/TesTaskExtended.cs @@ -21,7 +21,7 @@ public partial class TesTask : RepositoryItem TesState.RUNNINGEnum, TesState.PAUSEDEnum, TesState.INITIALIZINGEnum, - TesState.CANCELINGEnum + TesState.CANCELINGEnum, }; /// diff --git a/src/TesApi.Web/AzureBatchTaskState.cs b/src/TesApi.Web/AzureBatchTaskState.cs index b7be1a3d5..e836faf7f 100644 --- a/src/TesApi.Web/AzureBatchTaskState.cs +++ b/src/TesApi.Web/AzureBatchTaskState.cs @@ -120,23 +120,7 @@ public record struct OutputFileLog(Uri Url, string Path, long Size); /// /// Failure code. Intended to be machine readable. See . /// Failure details to be added to . - public record struct FailureInformation(string Reason, IEnumerable SystemLogs) - { - - /// - /// Failure details to be added to . - /// - public IEnumerable SystemLogs { get; private set; } = SystemLogs; - - /// - /// Adds additional logs to . - /// - /// Additional logs to add. 
- public void AppendRangeToSystemLogs(IEnumerable additionalLogs) - { - SystemLogs = SystemLogs.Concat(additionalLogs); - } - } + public record struct FailureInformation(string Reason, IEnumerable SystemLogs); /// /// SystemLog-appending copy constructor diff --git a/src/TesApi.Web/Events/RunnerEventsProcessor.cs b/src/TesApi.Web/Events/RunnerEventsProcessor.cs index 220ae6d3b..3cfc75948 100644 --- a/src/TesApi.Web/Events/RunnerEventsProcessor.cs +++ b/src/TesApi.Web/Events/RunnerEventsProcessor.cs @@ -207,7 +207,7 @@ public IEnumerable OrderProcessedByExecutorSequence(IEnumerable source, return source.OrderBy(t => OrderBy(messageGetter(t))).ThenBy(t => ThenBy(messageGetter(t))); static DateTime OrderBy(RunnerEventsMessage message) - => message.RunnerEventMessage?.Created ?? DateTime.Parse(message.Tags["created"]).ToUniversalTime(); + => (message.RunnerEventMessage?.Created ?? DateTime.Parse(message.Tags["created"])).ToUniversalTime(); static int ThenBy(RunnerEventsMessage message) => ParseEventName(message.RunnerEventMessage is null @@ -246,7 +246,8 @@ public async Task GetMessageBatchStateAsync(RunnerEventsMes Failure: new("SystemError", Enumerable.Empty() .Append("Download failed.") - .Append(nodeMessage.EventData["errorMessage"]))), + .Append(nodeMessage.EventData["errorMessage"]) + .Concat(await AddProcessLogsIfAvailable(nodeMessage, tesTask, cancellationToken)))), _ => throw new System.Diagnostics.UnreachableException(), }, @@ -265,7 +266,8 @@ public async Task GetMessageBatchStateAsync(RunnerEventsMes AzureBatchTaskState.TaskState.InfoUpdate, Failure: new("ExecutorError", Enumerable.Empty() - .Append(nodeMessage.EventData["errorMessage"])), + .Append(nodeMessage.EventData["errorMessage"]) + .Concat(await AddProcessLogsIfAvailable(nodeMessage, tesTask, cancellationToken))), ExecutorEndTime: nodeMessage.Created, ExecutorExitCode: int.Parse(nodeMessage.EventData["exitCode"])), @@ -285,7 +287,8 @@ public async Task GetMessageBatchStateAsync(RunnerEventsMes 
Failure: new("SystemError", Enumerable.Empty() .Append("Upload failed.") - .Append(nodeMessage.EventData["errorMessage"]))), + .Append(nodeMessage.EventData["errorMessage"]) + .Concat(await AddProcessLogsIfAvailable(nodeMessage, tesTask, cancellationToken)))), _ => throw new System.Diagnostics.UnreachableException(), }, @@ -312,17 +315,21 @@ public async Task GetMessageBatchStateAsync(RunnerEventsMes _ => throw new System.Diagnostics.UnreachableException(), }; - var processLogs = await GetProcessLogs(nodeMessage, tesTask, cancellationToken).ToListAsync(cancellationToken); + return state; - if (processLogs.Any()) + // Helpers + async ValueTask> AddProcessLogsIfAvailable(Tes.Runner.Events.EventMessage message, Tes.Models.TesTask tesTask, CancellationToken cancellationToken) { - processLogs.Insert(0, "Possibly relevant logs:"); - state.Failure?.AppendRangeToSystemLogs(processLogs); - } + var processLogs = await GetProcessLogs(message, tesTask, cancellationToken).ToListAsync(cancellationToken); - return state; + if (processLogs.Any()) + { + processLogs.Insert(0, "Possibly relevant logs:"); + } + + return processLogs; + } - // Helpers static IEnumerable GetFileLogs(IDictionary eventData) { const string marker = "/wd/"; From bb3a13775acc4a90ae119215b8a3de3b5a680794 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 6 Nov 2023 16:24:24 -0800 Subject: [PATCH 073/202] Address exceptions and determine cause of task deletion failures --- src/Tes.ApiClients/HttpApiClient.cs | 39 +++++--- src/Tes.ApiClients/RetryHandler.cs | 30 +++--- src/TesApi.Tests/BatchSchedulerTests.cs | 6 +- src/TesApi.Web/AzureProxy.cs | 93 ++++++++++++++----- src/TesApi.Web/BatchScheduler.cs | 11 +-- .../CachingWithRetriesAzureProxy.cs | 86 ++++++++++------- .../Management/ContainerRegistryProvider.cs | 3 +- 7 files changed, 171 insertions(+), 97 deletions(-) diff --git a/src/Tes.ApiClients/HttpApiClient.cs b/src/Tes.ApiClients/HttpApiClient.cs index 9a7724eda..03b2cd3b1 100644 --- 
a/src/Tes.ApiClients/HttpApiClient.cs +++ b/src/Tes.ApiClients/HttpApiClient.cs @@ -69,22 +69,33 @@ protected HttpApiClient(TokenCredential tokenCredential, string tokenScope, /// protected HttpApiClient() { } - private Action LogRetryErrorOnRetryHandler([System.Runtime.CompilerServices.CallerMemberName] string? caller = default) - => new((exception, retryCount, timeSpan) => + /// + /// A logging Polly retry handler. + /// + /// Calling method name. + /// + private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHandler([System.Runtime.CompilerServices.CallerMemberName] string? caller = default) + => new((exception, timeSpan, retryCount, correlationId) => { - Logger?.LogError(exception, @"Retrying {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan}", caller, retryCount, timeSpan); + Logger?.LogError(exception, @"Retrying in {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan} CorrelationId: {CorrelationId:D}", caller, retryCount, timeSpan, correlationId); }); - private Action, TimeSpan, int> LogRetryErrorOnRetryHandler([System.Runtime.CompilerServices.CallerMemberName] string? caller = default) - => new((result, retryCount, timeSpan) => + /// + /// A logging Polly retry handler. + /// + /// See + /// Calling method name. + /// + private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHandler([System.Runtime.CompilerServices.CallerMemberName] string? 
caller = default) + => new((result, timeSpan, retryCount, correlationId) => { if (result.Exception is null) { - Logger?.LogError(@"Retrying {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan}", caller, retryCount, timeSpan); + Logger?.LogError(@"Retrying in {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan} CorrelationId: {CorrelationId:D}", caller, retryCount, timeSpan, correlationId); } else { - Logger?.LogError(result.Exception, @"Retrying {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan}", caller, retryCount, timeSpan); + Logger?.LogError(result.Exception, @"Retrying in {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan} CorrelationId: {CorrelationId:D}", caller, retryCount, timeSpan, correlationId); } }); @@ -230,8 +241,8 @@ protected static string AppendQueryStringParams(params string[] arguments) return string.Empty; } - var queryString = ""; - var prefix = ""; + var queryString = string.Empty; + var prefix = string.Empty; foreach (var argument in arguments) { @@ -272,7 +283,7 @@ private async Task AddAuthorizationHeaderToRequestAsync(HttpRequestMessage reque nameof(tokenScope)); } - Logger.LogTrace("Getting token for scope:{}", tokenScope); + Logger.LogTrace("Getting token for scope:{TokenScope}", tokenScope); try { @@ -282,8 +293,8 @@ private async Task AddAuthorizationHeaderToRequestAsync(HttpRequestMessage reque } catch (Exception e) { - Logger.LogError(@"Failed to set authentication header with the access token for scope:{tokenScope}", - e); + Logger.LogError(e, @"Failed to set authentication header with the access token for scope:{TokenScope}", + tokenScope); throw; } } @@ -297,7 +308,7 @@ private async Task GetOrRefreshAccessTokenAsync(CancellationToken cancel if (DateTimeOffset.UtcNow < accessToken.ExpiresOn) { Logger.LogTrace( - $"Using existing token. Token has not expired. Token expiration date: {accessToken.ExpiresOn}"); + @"Using existing token. Token has not expired. 
Token expiration date: {TokenExpiresOn}", accessToken.ExpiresOn); return accessToken.Token; } @@ -305,7 +316,7 @@ private async Task GetOrRefreshAccessTokenAsync(CancellationToken cancel new TokenRequestContext(new[] { tokenScope }), cancellationToken); - Logger.LogTrace($"Returning a new token with an expiration date of: {newAccessToken.ExpiresOn}"); + Logger.LogTrace(@"Returning a new token with an expiration date of: {TokenExpiresOn}", newAccessToken.ExpiresOn); accessToken = newAccessToken; return accessToken.Token; } diff --git a/src/Tes.ApiClients/RetryHandler.cs b/src/Tes.ApiClients/RetryHandler.cs index 47440e35a..f8d378b65 100644 --- a/src/Tes.ApiClients/RetryHandler.cs +++ b/src/Tes.ApiClients/RetryHandler.cs @@ -30,18 +30,20 @@ public class RetryHandler /// The handled exception. /// The current sleep duration. /// The current retry count. It starts at 1 between the first handled condition and the first wait, then 2, etc. + /// A Guid guaranteed to be unique to each execution. Acts as a correlation id so that events specific to a single execution can be identified in logging and telemetry. /// This is called right before the wait. - public delegate void OnRetryHandler(Exception outcome, TimeSpan timespan, int retryCount); + public delegate void OnRetryHandler(Exception outcome, TimeSpan timespan, int retryCount, Guid correlationId); /// /// The action to call on each retry. /// - /// See . + /// See . /// The handled exception or result. /// The current sleep duration. /// The current retry count. It starts at 1 between the first handled condition and the first wait, then 2, etc. + /// A Guid guaranteed to be unique to each execution. Acts as a correlation id so that events specific to a single execution can be identified in logging and telemetry. /// This is called right before the wait. 
- public delegate void OnRetryHandler(DelegateResult result, TimeSpan timespan, int retryCount); + public delegate void OnRetryHandler(DelegateResult result, TimeSpan timespan, int retryCount, Guid correlationId); /// /// Synchronous retry policy instance. @@ -69,14 +71,14 @@ public RetryHandler(IOptions retryPolicyOptions) attempt)), OnRetry); } - private static void OnRetry(DelegateResult result, TimeSpan span, int retryCount, Context ctx) + public static void OnRetry(DelegateResult result, TimeSpan span, int retryCount, Context ctx) { - ctx.GetOnRetryHandler()?.Invoke(result, span, retryCount); + ctx.GetOnRetryHandler()?.Invoke(result, span, retryCount, ctx.CorrelationId); } - private static void OnRetry(Exception outcome, TimeSpan timespan, int retryCount, Context ctx) + public static void OnRetry(Exception outcome, TimeSpan timespan, int retryCount, Context ctx) { - ctx.GetOnRetryHandler()?.Invoke(outcome, timespan, retryCount); + ctx.GetOnRetryHandler()?.Invoke(outcome, timespan, retryCount, ctx.CorrelationId); } /// @@ -176,23 +178,23 @@ public virtual async Task ExecuteWithRetryAsync(Func(this Context context, Action, TimeSpan, int> onretry) + public static void SetOnRetryHandler(this Context context, RetryHandler.OnRetryHandler onRetry) { - context[RetryHandler.OnRetryHandlerKey] = onretry; + context[RetryHandler.OnRetryHandlerKey] = onRetry; } - public static Action, TimeSpan, int>? GetOnRetryHandler(this Context context) + public static RetryHandler.OnRetryHandler? GetOnRetryHandler(this Context context) { - return context.TryGetValue(RetryHandler.OnRetryHandlerKey, out var handler) ? (Action, TimeSpan, int>)handler : default; + return context.TryGetValue(RetryHandler.OnRetryHandlerKey, out var handler) ? 
(RetryHandler.OnRetryHandler)handler : default; } - public static void SetOnRetryHandler(this Context context, Action onRetry) + public static void SetOnRetryHandler(this Context context, RetryHandler.OnRetryHandler onRetry) { context[RetryHandler.OnRetryHandlerKey] = onRetry; } - public static Action? GetOnRetryHandler(this Context context) + public static RetryHandler.OnRetryHandler? GetOnRetryHandler(this Context context) { - return context.TryGetValue(RetryHandler.OnRetryHandlerKey, out var handler) ? (Action)handler : default; + return context.TryGetValue(RetryHandler.OnRetryHandlerKey, out var handler) ? (RetryHandler.OnRetryHandler)handler : default; } } diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index ee05c6fbf..658fdae9a 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -852,7 +852,7 @@ public async Task TaskIsRequeuedUpToThreeTimesForTransientErrors() await GetNewTesTaskStateAsync(tesTask, azureProxyReturnValues); await GuardAssertsWithTesTask(tesTask, async () => Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(tesTask, BatchTaskStates.NodeAllocationFailed))); await GetNewTesTaskStateAsync(tesTask, azureProxyReturnValues); - await GuardAssertsWithTesTask(tesTask, async () => Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(tesTask, BatchTaskStates.NodeAllocationFailed))); + await GuardAssertsWithTesTask(tesTask, async () => Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(tesTask, BatchTaskStates.NodeAllocationFailed))); } [TestMethod] @@ -886,7 +886,7 @@ public async Task TaskGetsCancelled() var tesTask = new TesTask { Id = "test", PoolId = "pool1", State = TesState.CANCELINGEnum }; var azureProxyReturnValues = AzureProxyReturnValues.Defaults; - azureProxyReturnValues.BatchTaskState = BatchTaskStates.TaskPreparing; + azureProxyReturnValues.BatchTaskState = BatchTaskStates.Terminated; 
Mock azureProxy = default; var azureProxySetter = new Action>(mock => { @@ -1844,7 +1844,7 @@ private struct BatchTaskStates public static AzureBatchTaskState NodePreempted => new(AzureBatchTaskState.TaskState.NodePreempted); public static AzureBatchTaskState NodeDiskFull => new(AzureBatchTaskState.TaskState.NodeFailedDuringStartupOrExecution, Failure: new("DiskFull", new[] { "Error message." })); - public static AzureBatchTaskState Terminated => new(AzureBatchTaskState.TaskState.CompletedWithErrors, CloudTaskCreationTime: DateTimeOffset.UtcNow - TimeSpan.FromMinutes(12)); + public static AzureBatchTaskState Terminated => new(AzureBatchTaskState.TaskState.CancellationRequested, CloudTaskCreationTime: DateTimeOffset.UtcNow - TimeSpan.FromMinutes(12)); //public static AzureBatchTaskState ActiveJobWithMissingAutoPool => new() { ActiveJobWithMissingAutoPool = true }; //public static AzureBatchTaskState ImageDownloadFailed => new() { JobState = JobState.Active, NodeErrorCode = "ContainerInvalidImage" }; } diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index 9063e1b76..93c92a885 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -25,6 +25,8 @@ using Microsoft.Rest; using Polly; using Polly.Retry; +using Tes.ApiClients; +using Tes.ApiClients.Options; using TesApi.Web.Extensions; using TesApi.Web.Management.Batch; using TesApi.Web.Management.Configuration; @@ -39,14 +41,11 @@ namespace TesApi.Web /// /// Wrapper for Azure APIs /// + // TODO: Consider breaking the different sets of Azure APIs into their own classes. 
public partial class AzureProxy : IAzureProxy { private const char BatchJobAttemptSeparator = '-'; - private static readonly AsyncRetryPolicy batchRaceConditionJobNotFoundRetryPolicy = Policy - .Handle(ex => ex.RequestInformation.BatchError.Code == BatchErrorCodeStrings.JobNotFound) - .WaitAndRetryAsync(5, retryAttempt => TimeSpan.FromSeconds(Math.Pow(2, retryAttempt))); - - private readonly AsyncRetryPolicy batchNodeNotReadyRetryPolicy; + private readonly AsyncRetryPolicy batchRetryPolicy; private readonly ILogger logger; private readonly BatchClient batchClient; @@ -61,17 +60,20 @@ public partial class AzureProxy : IAzureProxy /// /// Constructor of AzureProxy /// + /// Retry policy options /// The Azure Batch Account options /// /// The logger /// - public AzureProxy(IOptions batchAccountOptions, IBatchPoolManager batchPoolManager, ILogger logger/*, Azure.Core.TokenCredential tokenCredential*/) + public AzureProxy(IOptions retryPolicyOptions, IOptions batchAccountOptions, IBatchPoolManager batchPoolManager, ILogger logger) { + ArgumentNullException.ThrowIfNull(retryPolicyOptions); ArgumentNullException.ThrowIfNull(batchAccountOptions); ArgumentNullException.ThrowIfNull(logger); ArgumentNullException.ThrowIfNull(batchPoolManager); this.batchPoolManager = batchPoolManager; + this.logger = logger; if (string.IsNullOrWhiteSpace(batchAccountOptions.Value.AccountName)) { @@ -79,20 +81,11 @@ public AzureProxy(IOptions batchAccountOptions, IBatchPoolM throw new InvalidOperationException("The batch account name is missing from the the configuration."); } - this.logger = logger; - - this.batchNodeNotReadyRetryPolicy = Policy - .Handle(ex => "NodeNotReady".Equals(ex.RequestInformation?.BatchError?.Code, StringComparison.InvariantCultureIgnoreCase)) - .WaitAndRetryAsync( - 5, - (retryAttempt, exception, _) => (exception as BatchException)?.RequestInformation?.RetryAfter ?? 
TimeSpan.FromSeconds(Math.Pow(2, retryAttempt)), - (exception, delay, retryAttempt, _) => - { - var requestId = (exception as BatchException)?.RequestInformation?.ServiceRequestId; - var reason = (exception.InnerException as Microsoft.Azure.Batch.Protocol.Models.BatchErrorException)?.Response?.ReasonPhrase; - this.logger.LogDebug(exception, "Retry attempt {RetryAttempt} after delay {DelaySeconds} for NodeNotReady exception: ServiceRequestId: {ServiceRequestId}, BatchErrorCode: NodeNotReady, Reason: {ReasonPhrase}", retryAttempt, delay.TotalSeconds, requestId, reason); - return Task.FromResult(false); - }); + batchRetryPolicy = Policy + .Handle() + .WaitAndRetryAsync(retryPolicyOptions.Value.MaxRetryCount, + (attempt, exception, ctx) => (exception as BatchException)?.RequestInformation?.RetryAfter ?? TimeSpan.FromSeconds(Math.Pow(retryPolicyOptions.Value.ExponentialBackOffExponent, attempt)), + (outcome, timespan, retryCount, ctx) => { RetryHandler.OnRetry(outcome, timespan, retryCount, ctx); return Task.CompletedTask; }); if (!string.IsNullOrWhiteSpace(batchAccountOptions.Value.AppKey)) { @@ -123,7 +116,51 @@ public AzureProxy(IOptions batchAccountOptions, IBatchPoolM //} } - internal AzureProxy() { } // TODO: Remove. Temporary WIP + /// + /// Rethrows exception if exception is and the Batch API Error Code returned otherwise invokes . + /// + /// Polly retry handler. 
+ /// + private static RetryHandler.OnRetryHandler OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenJobNotFound(RetryHandler.OnRetryHandler OnRetry) + => new((outcome, timespan, retryCount, correlationId) => + { + if (outcome is BatchException batchException && batchException.RequestInformation?.BatchError?.Code != BatchErrorCodeStrings.JobNotFound) + { + System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(outcome).Throw(); + } + + OnRetry?.Invoke(outcome, timespan, retryCount, correlationId); + }); + + /// + /// Rethrows exception if exception is and the Batch API Error Code returned "NodeNotReady" otherwise invokes . + /// + /// Polly retry handler. + /// + private static RetryHandler.OnRetryHandler OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenNodeNotReady(RetryHandler.OnRetryHandler OnRetry) + => new((outcome, timespan, retryCount, correlationId) => + { + if (outcome is BatchException batchException && !"NodeNotReady".Equals(batchException.RequestInformation?.BatchError?.Code, StringComparison.InvariantCultureIgnoreCase)) + { + System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(outcome).Throw(); + } + + OnRetry?.Invoke(outcome, timespan, retryCount, correlationId); + }); + + /// + /// A logging Polly retry handler. + /// + /// Calling method name. 
+ /// + private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHandler([System.Runtime.CompilerServices.CallerMemberName] string caller = default) + => new((exception, timeSpan, retryCount, correlationId) => + { + var requestId = (exception as BatchException)?.RequestInformation?.ServiceRequestId; + var reason = (exception.InnerException as Microsoft.Azure.Batch.Protocol.Models.BatchErrorException)?.Response?.ReasonPhrase; + logger?.LogError(exception, @"Retrying in {Method}: RetryCount: {RetryCount} RetryCount: {TimeSpan} BatchErrorCode: '{BatchErrorCode}', ApiStatusCode '{ApiStatusCode}', Reason: '{ReasonPhrase}' ServiceRequestId: '{ServiceRequestId}', CorrelationId: {CorrelationId:D}", + caller, retryCount, timeSpan, (exception as BatchException)?.RequestInformation?.BatchError?.Code, (exception as BatchException)?.RequestInformation?.HttpStatusCode, reason, requestId, correlationId); + }); // TODO: Static method because the instrumentation key is needed in both Program.cs and Startup.cs and we wanted to avoid intializing the batch client twice. // Can we skip initializing app insights with a instrumentation key in Program.cs? If yes, change this to an instance method. 
@@ -219,9 +256,11 @@ public async Task AddBatchTaskAsync(string tesTaskId, CloudTask cloudTask, strin ArgumentException.ThrowIfNullOrEmpty(jobId); logger.LogInformation("TES task: {TesTask} - Adding task to job {BatchJob}", tesTaskId, jobId); - var job = await batchRaceConditionJobNotFoundRetryPolicy.ExecuteAsync(ct => + var ctx = new Context(); + ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenJobNotFound(LogRetryErrorOnRetryHandler())); + var job = await batchRetryPolicy.ExecuteAsync((_, ct) => batchClient.JobOperations.GetJobAsync(jobId, cancellationToken: ct), - cancellationToken); + ctx, cancellationToken); await job.AddTaskAsync(cloudTask, cancellationToken: cancellationToken); logger.LogInformation("TES task: {TesTask} - Added task successfully", tesTaskId); @@ -264,7 +303,9 @@ public async Task TerminateBatchTaskAsync(string tesTaskId, string jobId, Cancel foreach (var task in batchTasksToTerminate) { logger.LogInformation("Terminating task {BatchTask}", task.Id); - await batchNodeNotReadyRetryPolicy.ExecuteAsync(ct => task.TerminateAsync(cancellationToken: ct), cancellationToken); + var ctx = new Context(); + ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenNodeNotReady(LogRetryErrorOnRetryHandler())); + await batchRetryPolicy.ExecuteAsync((_, ct) => task.TerminateAsync(cancellationToken: ct), ctx, cancellationToken); } } @@ -297,7 +338,9 @@ public async Task DeleteBatchTaskAsync(string tesTaskId, string jobId, Cancellat foreach (var task in batchTasksToDelete) { logger.LogInformation("Deleting task {BatchTask}", task.Id); - await batchNodeNotReadyRetryPolicy.ExecuteAsync(ct => task.DeleteAsync(cancellationToken: ct), cancellationToken); + var ctx = new Context(); + ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenNodeNotReady(LogRetryErrorOnRetryHandler())); + await batchRetryPolicy.ExecuteAsync((_, ct) => task.DeleteAsync(cancellationToken: ct), ctx, cancellationToken); } } diff --git 
a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index b027b2b3a..aa92a3b13 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -174,7 +174,6 @@ public BatchScheduler( static bool tesTaskIsInitializingOrRunning(TesTask tesTask) => tesTask.State == TesState.INITIALIZINGEnum || tesTask.State == TesState.RUNNINGEnum; static bool tesTaskIsQueuedOrInitializing(TesTask tesTask) => tesTask.State == TesState.QUEUEDEnum || tesTask.State == TesState.INITIALIZINGEnum; static bool tesTaskIsQueued(TesTask tesTask) => tesTask.State == TesState.QUEUEDEnum; - static bool tesTaskCancellationRequested(TesTask tesTask) => tesTask.State == TesState.CANCELINGEnum; static bool tesTaskDeletionReady(TesTask tesTask) => tesTask.IsTaskDeletionRequired; var setTaskStateLock = new object(); @@ -307,12 +306,12 @@ async Task SetTaskStateAfterFailureAsync(TesTask tesTask, TesState newTask Task RequeueTaskAfterFailureAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) => ++tesTask.ErrorCount > 3 - ? AddSystemLogAndSetTaskExecutorErrorAsync(tesTask, batchInfo, "System Error: Retry count exceeded.", cancellationToken) + ? 
AddSystemLogAndSetTaskSystemErrorAsync(tesTask, batchInfo, "System Error: Retry count exceeded.", cancellationToken) : SetTaskStateAfterFailureAsync(tesTask, TesState.QUEUEDEnum, batchInfo, cancellationToken); - Task AddSystemLogAndSetTaskExecutorErrorAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, string additionalSystemLogItem, CancellationToken cancellationToken) + Task AddSystemLogAndSetTaskSystemErrorAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, string additionalSystemLogItem, CancellationToken cancellationToken) { - return SetTaskExecutorError(tesTask, new(batchInfo, additionalSystemLogItem), cancellationToken); + return SetTaskSystemError(tesTask, new(batchInfo, additionalSystemLogItem), cancellationToken); } Task HandlePreemptedNodeAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) @@ -329,8 +328,8 @@ Task HandleInfoUpdate(TesTask tesTask, CombinedBatchTaskInfo batchInfo, Ca tesTaskStateTransitions = new List() { - new TesTaskStateTransition(tesTaskDeletionReady, batchTaskState: null, alternateSystemLogItem: null, (tesTask, _, ct) => DeleteCancelledTaskAsync(tesTask, ct)), - new TesTaskStateTransition(tesTaskCancellationRequested, batchTaskState: null, alternateSystemLogItem: null, (tesTask, _, ct) => TerminateBatchTaskAsync(tesTask, ct)), + new TesTaskStateTransition(tesTaskDeletionReady, AzureBatchTaskState.TaskState.CancellationRequested, alternateSystemLogItem: null, (tesTask, _, ct) => DeleteCancelledTaskAsync(tesTask, ct)), + new TesTaskStateTransition(condition: null, AzureBatchTaskState.TaskState.CancellationRequested, alternateSystemLogItem: null, (tesTask, _, ct) => TerminateBatchTaskAsync(tesTask, ct)), //new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.JobNotFound, alternateSystemLogItem: null, (tesTask, _, ct) => AddBatchTaskAsync(tesTask, ct)), //new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.MissingBatchTask, alternateSystemLogItem: null, (tesTask, _, ct) => 
AddBatchTaskAsync(tesTask, ct)), new TesTaskStateTransition(tesTaskIsQueued, AzureBatchTaskState.TaskState.Initializing, alternateSystemLogItem: null, (tesTask, _) => { tesTask.State = TesState.INITIALIZINGEnum; return true; }), diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index 44ae70368..111a07446 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -51,53 +51,56 @@ public CachingWithRetriesAzureProxy(IAzureProxy azureProxy, CachingRetryHandler BatchErrorCodeStrings.JobExists }; + private static readonly string[] DeletionErrorFoundCodes = new[] + { + BatchErrorCodeStrings.TaskNotFound, + BatchErrorCodeStrings.PoolNotFound, + BatchErrorCodeStrings.JobNotFound + }; + /// - /// Rethrows exception if exception is and the Batch API call returned otherwise invokes . + /// Rethrows exception if exception is and the Batch API call returned otherwise invokes . /// /// Polly retry handler. - /// Polly retry handler. - private static Action OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenExists(Action OnRetry) - => new((outcome, timespan, retryCount) => + /// + private static RetryHandler.OnRetryHandler OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenExists(RetryHandler.OnRetryHandler OnRetry) + => new((outcome, timespan, retryCount, correlationId) => { if (outcome is BatchException batchException && CreationErrorFoundCodes.Contains(batchException.RequestInformation?.BatchError?.Code, StringComparer.OrdinalIgnoreCase)) { ExceptionDispatchInfo.Capture(outcome).Throw(); } - OnRetry?.Invoke(outcome, timespan, retryCount); + OnRetry?.Invoke(outcome, timespan, retryCount, correlationId); }); /// /// Rethrows exception if exception is and the Batch API call returned otherwise invokes . /// /// Polly retry handler. - /// Polly retry handler. 
- private static Action OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound(Action OnRetry) - => new((outcome, timespan, retryCount) => + /// + private static RetryHandler.OnRetryHandler OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound(RetryHandler.OnRetryHandler OnRetry) + => new((outcome, timespan, retryCount, correlationId) => { - if (outcome is BatchException batchException && batchException.RequestInformation.HttpStatusCode == System.Net.HttpStatusCode.NotFound) + if (outcome is BatchException batchException && DeletionErrorFoundCodes.Contains(batchException.RequestInformation?.BatchError?.Code, StringComparer.OrdinalIgnoreCase)) { ExceptionDispatchInfo.Capture(outcome).Throw(); } - OnRetry?.Invoke(outcome, timespan, retryCount); + OnRetry?.Invoke(outcome, timespan, retryCount, correlationId); }); - private Action LogRetryErrorOnRetryHandler([System.Runtime.CompilerServices.CallerMemberName] string caller = default) - => new((exception, retryCount, timeSpan) => + /// + /// A logging Polly retry handler. + /// + /// Calling method name. 
+ /// + private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHandler([System.Runtime.CompilerServices.CallerMemberName] string caller = default) + => new((exception, timeSpan, retryCount, correlationId) => { - logger?.LogError(exception, @"Retrying {Method}: RetryCount: {RetryCount} RetryCount: {TimeSpan}", caller, retryCount, timeSpan); + logger?.LogError(exception, @"Retrying in {Method}: RetryCount: {RetryCount} RetryCount: {TimeSpan} CorrelationId: {CorrelationId:D}", caller, retryCount, timeSpan, correlationId); }); - //private Action, TimeSpan, int> LogRetryErrorOnRetryHandler([System.Runtime.CompilerServices.CallerMemberName] string caller = default) - // => new((result, retryCount, timeSpan) => - // { - // if (result.Exception is not null) - // { - // logger?.LogError(result.Exception, @"Retrying {Method}: RetryCount: {RetryCount} RetryCount: {TimeSpan}", caller, retryCount, timeSpan); - // } - // }); - /// public async Task CreateBatchJobAsync(string jobId, CancellationToken cancellationToken) @@ -126,19 +129,29 @@ public async Task AddBatchTaskAsync(string tesTaskId, CloudTask cloudTask, strin } /// - public Task DeleteBatchJobAsync(string jobId, CancellationToken cancellationToken) + public async Task DeleteBatchJobAsync(string jobId, CancellationToken cancellationToken) { - var ctx = new Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchJobAsync(jobId, ct), cancellationToken, ctx); + try + { + var ctx = new Context(); + ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound(LogRetryErrorOnRetryHandler())); + await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchJobAsync(jobId, ct), cancellationToken, ctx); + } + catch (BatchException exc) when (BatchErrorCodeStrings.JobNotFound.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) + { } } /// - public Task 
DeleteBatchTaskAsync(string tesTaskId, string jobId, CancellationToken cancellationToken) + public async Task DeleteBatchTaskAsync(string tesTaskId, string jobId, CancellationToken cancellationToken) { - var ctx = new Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchTaskAsync(tesTaskId, jobId, ct), cancellationToken, ctx); + try + { + var ctx = new Context(); + ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound(LogRetryErrorOnRetryHandler())); + await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchTaskAsync(tesTaskId, jobId, ct), cancellationToken, ctx); + } + catch (BatchException exc) when (BatchErrorCodeStrings.TaskNotFound.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) + { } } /// @@ -150,11 +163,16 @@ public Task TerminateBatchTaskAsync(string tesTaskId, string jobId, Cancellation } /// - public Task DeleteBatchPoolAsync(string poolId, CancellationToken cancellationToken) + public async Task DeleteBatchPoolAsync(string poolId, CancellationToken cancellationToken) { - var ctx = new Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchPoolAsync(poolId, ct), cancellationToken, ctx); + try + { + var ctx = new Context(); + ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound(LogRetryErrorOnRetryHandler())); + await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchPoolAsync(poolId, ct), cancellationToken, ctx); + } + catch (BatchException exc) when (BatchErrorCodeStrings.PoolNotFound.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) + { } } /// diff --git a/src/TesApi.Web/Management/ContainerRegistryProvider.cs b/src/TesApi.Web/Management/ContainerRegistryProvider.cs index a0515f1ab..0fcede6d7 
100644 --- a/src/TesApi.Web/Management/ContainerRegistryProvider.cs +++ b/src/TesApi.Web/Management/ContainerRegistryProvider.cs @@ -100,7 +100,8 @@ public bool IsImagePublic(string imageName) private async Task LookUpAndAddToCacheContainerRegistryInfoAsync(string imageName, CancellationToken cancellationToken) { var ctx = new Polly.Context(); - ctx.SetOnRetryHandler((outcome, timespan, retryCount) => Logger.LogError(outcome, @"Retrying {Method}: RetryCount: {RetryCount} RetryCount: {TimeSpan}", nameof(LookUpAndAddToCacheContainerRegistryInfoAsync), retryCount, timespan)); + ctx.SetOnRetryHandler((outcome, timespan, retryCount, correlationId) => + Logger.LogError(outcome, @"Retrying in {Method}: RetryCount: {RetryCount} RetryCount: {TimeSpan} CorrelationId: {CorrelationId:D}", nameof(LookUpAndAddToCacheContainerRegistryInfoAsync), retryCount, timespan, correlationId)); var repositories = await CachingRetryHandler.ExecuteWithRetryAsync(GetAccessibleContainerRegistriesAsync, cancellationToken: cancellationToken, context: ctx); var requestedRepo = repositories?.FirstOrDefault(reg => From c1c68f4a9a24259a6117fd5f395378c3548e82c9 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 8 Nov 2023 03:43:11 -0800 Subject: [PATCH 074/202] Yet another attempt to get the desired info --- src/TesApi.Web/AzureProxy.cs | 3 +- src/TesApi.Web/BatchPool.cs | 91 ++++++++++++------- src/TesApi.Web/BatchScheduler.cs | 8 +- .../CachingWithRetriesAzureProxy.cs | 2 +- src/TesApi.Web/IBatchScheduler.cs | 7 -- 5 files changed, 65 insertions(+), 46 deletions(-) diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index 93c92a885..e14cd49db 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -351,6 +351,7 @@ public IAsyncEnumerable GetActivePoolsAsync(string hostName) { FilterClause = "state eq 'active'", SelectClause = BatchPool.CloudPoolSelectClause + ",identity", + ExpandClause = "identity,metadata" }; return 
batchClient.PoolOperations.ListPools(activePoolsFilter).ToAsyncEnumerable() @@ -528,7 +529,7 @@ public async Task CreateBatchPoolAsync(BatchModels.Pool poolInfo, boo var poolId = await batchPoolManager.CreateBatchPoolAsync(poolInfo, isPreemptable, cancellationToken); logger.LogInformation("Successfully created batch pool named {PoolName} with vmSize {PoolVmSize} and low priority {IsPreemptable}", poolInfo.Name, poolInfo.VmSize, isPreemptable); - return await batchClient.PoolOperations.GetPoolAsync(poolId, detailLevel: new ODATADetailLevel { SelectClause = BatchPool.CloudPoolSelectClause }, cancellationToken: cancellationToken); + return await batchClient.PoolOperations.GetPoolAsync(poolId, detailLevel: new ODATADetailLevel { SelectClause = BatchPool.CloudPoolSelectClause, ExpandClause = "metadata" }, cancellationToken: cancellationToken); } // https://learn.microsoft.com/azure/azure-resource-manager/management/move-resource-group-and-subscription#changed-resource-id diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index d0146a9f5..adacd7602 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -60,11 +60,11 @@ public BatchPool(IBatchScheduler batchScheduler, IOptions StartTaskFailures { get; } = new(); private Queue ResizeErrors { get; } = new(); - private IAsyncEnumerable GetTasksAsync(string select, string filter) - => _removedFromService ? AsyncEnumerable.Empty() : _azureProxy.ListTasksAsync(Id, new ODATADetailLevel { SelectClause = select, FilterClause = filter }); + private IAsyncEnumerable GetTasksAsync(string select, string filter, string expand) + => _removedFromService ? AsyncEnumerable.Empty() : _azureProxy.ListTasksAsync(Id, new ODATADetailLevel { SelectClause = select, FilterClause = filter, ExpandClause = expand }); internal IAsyncEnumerable GetTasksAsync(bool includeCompleted) - => GetTasksAsync("id,stateTransitionTime", includeCompleted ? 
default : "state ne 'completed'"); + => GetTasksAsync("id,stateTransitionTime", includeCompleted ? default : "state ne 'completed'", null); private async ValueTask RemoveNodesAsync(IList nodesToRemove, CancellationToken cancellationToken) { @@ -315,6 +315,27 @@ public static string AutoPoolFormula(bool preemptable, int initialTarget) private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken cancellationToken) { + var nodeList = await GetNodesToRemove(false).ToDictionaryAsync(node => node.Id, cancellationToken: cancellationToken); + await foreach (var task in _azureProxy.ListTasksAsync(Id, new ODATADetailLevel { SelectClause = "id,executionInfo,nodeInfo", ExpandClause = "executionInfo,nodeInfo" }).WithCancellation(cancellationToken)) + { + var nodeId = task.ComputeNodeInformation?.ComputeNodeId; + if (nodeId is not null && nodeList.ContainsKey(nodeId)) + { + await _azureProxy.UploadBlobAsync( + new Uri(await _storageAccessProvider.GetInternalTesBlobUrlAsync( + $"nodeError/{nodeId}/{task.Id}-{new Guid():N}", + Azure.Storage.Sas.BlobSasPermissions.Create, + cancellationToken)), + System.Text.Json.JsonSerializer.Serialize(task.ExecutionInformation, + new System.Text.Json.JsonSerializerOptions() + { + DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingDefault, + Converters = { new System.Text.Json.Serialization.JsonStringEnumConverter(System.Text.Json.JsonNamingPolicy.CamelCase) } + }), + cancellationToken); + } + } + // This method implememts a state machine to disable/enable autoscaling as needed to clear certain conditions that can be observed // Inputs are _resetAutoScalingRequired, compute nodes in ejectable states, and the current _scalingMode, along with the pool's // allocation state and autoscale enablement. 
@@ -345,31 +366,6 @@ private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken canc { var nodesToRemove = Enumerable.Empty(); - async Task SendNodeErrorData(string nodeId, IReadOnlyList content) - { - var url = new Uri(await _storageAccessProvider.GetInternalTesBlobUrlAsync( - $"nodeError/{nodeId}-{new Guid():N}", - Azure.Storage.Sas.BlobSasPermissions.Create, - cancellationToken)); - - if (content is null || content!.Any()) - { - await _azureProxy.UploadBlobAsync(url, "No recent tasks found on node.", cancellationToken); - } - else - { - await _azureProxy.UploadBlobAsync( - url, - System.Text.Json.JsonSerializer.Serialize(content, - new System.Text.Json.JsonSerializerOptions() - { - DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingDefault, - Converters = { new System.Text.Json.Serialization.JsonStringEnumConverter(System.Text.Json.JsonNamingPolicy.CamelCase) } - }), - cancellationToken); - } - } - // It's documented that a max of 100 nodes can be removed at a time. Excess eligible nodes will be removed in a future call to this method. await foreach (var node in GetNodesToRemove(true).Take(MaxComputeNodesToRemoveAtOnce).WithCancellation(cancellationToken)) { @@ -377,7 +373,7 @@ await _azureProxy.UploadBlobAsync( { case ComputeNodeState.Unusable: _logger.LogDebug("Found unusable node {NodeId}", node.Id); - await SendNodeErrorData(node.Id, node.RecentTasks); + await SendNodeTaskInformation(node.Id, node.RecentTasks); //node.RecentTasks[0].ExecutionInformation.FailureInformation.Code == TaskFailureInformationCodes.DiskFull // TODO: notify running tasks that task will switch nodes? 
break; @@ -389,7 +385,7 @@ await _azureProxy.UploadBlobAsync( case ComputeNodeState.Preempted: _logger.LogDebug("Found preempted node {NodeId}", node.Id); - await SendNodeErrorData(node.Id, node.RecentTasks); + await SendNodeTaskInformation(node.Id, node.RecentTasks); //node.RecentTasks[0].TaskId //node.RecentTasks[0].ExecutionInformation.FailureInformation.Category == ErrorCategory.ServerError // TODO: notify running tasks that task will switch nodes? Or, in the future, terminate the task? @@ -454,6 +450,31 @@ async ValueTask GetTaskCountAsync(int @default) // Used to make reenabling IAsyncEnumerable GetNodesToRemove(bool withState) => _azureProxy.ListComputeNodesAsync(Id, new ODATADetailLevel(filterClause: @"state eq 'starttaskfailed' or state eq 'preempted' or state eq 'unusable'", selectClause: withState ? @"id,recentTasks,state,startTaskInfo" : @"id")); + + async Task SendNodeTaskInformation(string nodeId, IReadOnlyList content) + { + var url = new Uri(await _storageAccessProvider.GetInternalTesBlobUrlAsync( + $"nodeError/{nodeId}-{new Guid():N}", + Azure.Storage.Sas.BlobSasPermissions.Create, + cancellationToken)); + + if (content is null || content!.Any()) + { + await _azureProxy.UploadBlobAsync(url, "No recent tasks found on node.", cancellationToken); + } + else + { + await _azureProxy.UploadBlobAsync( + url, + System.Text.Json.JsonSerializer.Serialize(content, + new System.Text.Json.JsonSerializerOptions() + { + DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingDefault, + Converters = { new System.Text.Json.Serialization.JsonStringEnumConverter(System.Text.Json.JsonNamingPolicy.CamelCase) } + }), + cancellationToken); + } + } } private bool DetermineIsAvailable(DateTime? 
creation) @@ -666,7 +687,7 @@ async ValueTask RemoveMissingPoolsAsync(Exception ex, CancellationToken ca /// public IAsyncEnumerable GetTaskResizeFailuresAsync(CancellationToken cancellationToken) { - return GetTasksAsync("id", "state eq 'active'").Zip( + return GetTasksAsync("id", "state eq 'active'", null).Zip( GetFailures(cancellationToken), (cloud, state) => new CloudTaskBatchTaskState(cloud.Id, state)); @@ -711,7 +732,7 @@ TaskFailureInformation PopNextStartTaskFailure() /// public IAsyncEnumerable GetCompletedTasksAsync(CancellationToken _1) - => GetTasksAsync("id,executionInfo", $"state eq 'completed' and stateTransitionTime lt DateTime'{DateTime.UtcNow - TimeSpan.FromMinutes(2):O}'"); + => GetTasksAsync("id,executionInfo", $"state eq 'completed' and stateTransitionTime lt DateTime'{DateTime.UtcNow - TimeSpan.FromMinutes(2):O}'", "executionInfo"); /// public async ValueTask GetAllocationStateTransitionTimeAsync(CancellationToken cancellationToken = default) @@ -781,10 +802,10 @@ public async ValueTask AssignPoolAsync(CloudPool pool, bool forceRemove, Cancell throw new ArgumentException("CloudPool is either not configured correctly or was not retrieved with all required metadata.", nameof(pool)); } - // Pool is "broken" if job is missing/not active. Reject this pool via the side effect of the exception that is thrown. - if (1 != (await _azureProxy.GetBatchJobAsync(pool.Id, cancellationToken, new ODATADetailLevel { SelectClause = "id,state"/*, FilterClause = "state eq 'active'"*/ }).ToAsyncEnumerable().Where(j => j.State == JobState.Active).ToListAsync(cancellationToken)).Count) + // Pool is "broken" if its associated job is missing/not active. Reject this pool via the side effect of the exception that is thrown. 
+ var job = (await _azureProxy.GetBatchJobAsync(pool.Id, cancellationToken, new ODATADetailLevel { SelectClause = "poolInfo,state", ExpandClause = "poolInfo" })); + if (job.State != JobState.Active || !pool.Id.Equals(job.PoolInformation?.PoolId, StringComparison.OrdinalIgnoreCase)) { - // TODO: investigate why FilterClause throws "Type Microsoft.Azure.Batch.Protocol.BatchRequests.JobGetBatchRequest does not support a filter clause. (Parameter 'detailLevel')" throw new InvalidOperationException($"Active Job not found for Pool {pool.Id}"); } diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index aa92a3b13..8ec1ab4ed 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -402,8 +402,12 @@ private string CreateWgetDownloadCommand(string urlToDownload, string localFileP return command; } - /// - public IAsyncEnumerable GetCloudPools(CancellationToken cancellationToken) + /// + /// Retrieves pools associated with this TES from the batch account. + /// + /// A for controlling the lifetime of the asynchronous operation. 
+ /// + private IAsyncEnumerable GetCloudPools(CancellationToken cancellationToken) => azureProxy.GetActivePoolsAsync(batchPrefix); /// diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index 111a07446..34ba90229 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -330,7 +330,7 @@ public async Task CreateBatchPoolAsync(BatchModels.Pool poolInfo, boo } catch (BatchException exc) when (BatchErrorCodeStrings.PoolExists.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { - return await GetBatchPoolAsync(poolInfo.Name, cancellationToken, new ODATADetailLevel { SelectClause = BatchPool.CloudPoolSelectClause }); + return await GetBatchPoolAsync(poolInfo.Name, cancellationToken, new ODATADetailLevel { SelectClause = BatchPool.CloudPoolSelectClause, ExpandClause = "metadata" }); } } diff --git a/src/TesApi.Web/IBatchScheduler.cs b/src/TesApi.Web/IBatchScheduler.cs index 3f1c41711..52bdd6633 100644 --- a/src/TesApi.Web/IBatchScheduler.cs +++ b/src/TesApi.Web/IBatchScheduler.cs @@ -73,13 +73,6 @@ public interface IBatchScheduler /// Task DeletePoolAsync(IBatchPool pool, CancellationToken cancellationToken); - /// - /// Retrieves pools associated with this TES from the batch account. - /// - /// A for controlling the lifetime of the asynchronous operation. - /// - IAsyncEnumerable GetCloudPools(CancellationToken cancellationToken); - /// /// Removes pool from list of managed pools. /// From e2b897be964713e9cc5d83fab572efe68f215f53 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 8 Nov 2023 09:26:25 -0800 Subject: [PATCH 075/202] Property 'identity' on type 'Microsoft.Azure.Batch.Protocol.Entities.Pool' is not a navigation property. Only navigation properties can be expanded. 
--- src/TesApi.Web/AzureProxy.cs | 2 +- src/TesApi.Web/BatchPool.cs | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index e14cd49db..ec7806cbe 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -351,7 +351,7 @@ public IAsyncEnumerable GetActivePoolsAsync(string hostName) { FilterClause = "state eq 'active'", SelectClause = BatchPool.CloudPoolSelectClause + ",identity", - ExpandClause = "identity,metadata" + ExpandClause = "metadata" }; return batchClient.PoolOperations.ListPools(activePoolsFilter).ToAsyncEnumerable() diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index adacd7602..dff05fb4f 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -210,8 +210,11 @@ private async ValueTask ServicePoolGetResizeErrorsAsync(CancellationToken cancel var pool = await _azureProxy.GetBatchPoolAsync(Id, cancellationToken, new ODATADetailLevel { SelectClause = autoScaleEnabled ?? false - ? "id,allocationStateTransitionTime,autoScaleFormula,autoScaleRun,resizeErrors" - : "id,allocationStateTransitionTime,resizeErrors" + ? "id,allocationStateTransitionTime,autoScaleRun,resizeErrors" + : "id,allocationStateTransitionTime,resizeErrors", + ExpandClause = autoScaleEnabled ?? false + ? "autoScaleRun,resizeErrors" + : "resizeErrors", }); if ((autoScaleEnabled ?? false) && pool.AutoScaleRun?.Error is not null) @@ -803,7 +806,7 @@ public async ValueTask AssignPoolAsync(CloudPool pool, bool forceRemove, Cancell } // Pool is "broken" if its associated job is missing/not active. Reject this pool via the side effect of the exception that is thrown. 
- var job = (await _azureProxy.GetBatchJobAsync(pool.Id, cancellationToken, new ODATADetailLevel { SelectClause = "poolInfo,state", ExpandClause = "poolInfo" })); + var job = (await _azureProxy.GetBatchJobAsync(pool.Id, cancellationToken, new ODATADetailLevel { SelectClause = "poolInfo,state" })); if (job.State != JobState.Active || !pool.Id.Equals(job.PoolInformation?.PoolId, StringComparison.OrdinalIgnoreCase)) { throw new InvalidOperationException($"Active Job not found for Pool {pool.Id}"); From 31a34e5d675eafd2b1ab96fb33127ffc45910772 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 8 Nov 2023 10:28:09 -0800 Subject: [PATCH 076/202] https://learn.microsoft.com/azure/batch/batch-efficient-list-queries#expand --- src/TesApi.Web/AzureProxy.cs | 5 ++-- src/TesApi.Web/BatchPool.cs | 26 +++++++------------ .../CachingWithRetriesAzureProxy.cs | 2 +- 3 files changed, 12 insertions(+), 21 deletions(-) diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index ec7806cbe..090df2a45 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -350,8 +350,7 @@ public IAsyncEnumerable GetActivePoolsAsync(string hostName) var activePoolsFilter = new ODATADetailLevel { FilterClause = "state eq 'active'", - SelectClause = BatchPool.CloudPoolSelectClause + ",identity", - ExpandClause = "metadata" + SelectClause = BatchPool.CloudPoolSelectClause + ",identity" }; return batchClient.PoolOperations.ListPools(activePoolsFilter).ToAsyncEnumerable() @@ -529,7 +528,7 @@ public async Task CreateBatchPoolAsync(BatchModels.Pool poolInfo, boo var poolId = await batchPoolManager.CreateBatchPoolAsync(poolInfo, isPreemptable, cancellationToken); logger.LogInformation("Successfully created batch pool named {PoolName} with vmSize {PoolVmSize} and low priority {IsPreemptable}", poolInfo.Name, poolInfo.VmSize, isPreemptable); - return await batchClient.PoolOperations.GetPoolAsync(poolId, detailLevel: new ODATADetailLevel { SelectClause = 
BatchPool.CloudPoolSelectClause, ExpandClause = "metadata" }, cancellationToken: cancellationToken); + return await batchClient.PoolOperations.GetPoolAsync(poolId, detailLevel: new ODATADetailLevel { SelectClause = BatchPool.CloudPoolSelectClause }, cancellationToken: cancellationToken); } // https://learn.microsoft.com/azure/azure-resource-manager/management/move-resource-group-and-subscription#changed-resource-id diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index dff05fb4f..26b6713a3 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -23,7 +23,7 @@ public sealed partial class BatchPool /// /// Minimum property set required for provided to constructors of this class /// - public const string CloudPoolSelectClause = "id,creationTime,metadata"; + public const string CloudPoolSelectClause = "creationTime,id,metadata"; /// /// Autoscale evalutation interval @@ -60,11 +60,11 @@ public BatchPool(IBatchScheduler batchScheduler, IOptions StartTaskFailures { get; } = new(); private Queue ResizeErrors { get; } = new(); - private IAsyncEnumerable GetTasksAsync(string select, string filter, string expand) - => _removedFromService ? AsyncEnumerable.Empty() : _azureProxy.ListTasksAsync(Id, new ODATADetailLevel { SelectClause = select, FilterClause = filter, ExpandClause = expand }); + private IAsyncEnumerable GetTasksAsync(string select, string filter) + => _removedFromService ? AsyncEnumerable.Empty() : _azureProxy.ListTasksAsync(Id, new ODATADetailLevel { SelectClause = select, FilterClause = filter }); internal IAsyncEnumerable GetTasksAsync(bool includeCompleted) - => GetTasksAsync("id,stateTransitionTime", includeCompleted ? default : "state ne 'completed'", null); + => GetTasksAsync("id,stateTransitionTime", includeCompleted ? 
default : "state ne 'completed'"); private async ValueTask RemoveNodesAsync(IList nodesToRemove, CancellationToken cancellationToken) { @@ -207,15 +207,7 @@ private async ValueTask ServicePoolGetResizeErrorsAsync(CancellationToken cancel if (allocationState == AllocationState.Steady) { - var pool = await _azureProxy.GetBatchPoolAsync(Id, cancellationToken, new ODATADetailLevel - { - SelectClause = autoScaleEnabled ?? false - ? "id,allocationStateTransitionTime,autoScaleRun,resizeErrors" - : "id,allocationStateTransitionTime,resizeErrors", - ExpandClause = autoScaleEnabled ?? false - ? "autoScaleRun,resizeErrors" - : "resizeErrors", - }); + var pool = await _azureProxy.GetBatchPoolAsync(Id, cancellationToken, new ODATADetailLevel { SelectClause = "allocationStateTransitionTime,id,resizeErrors" + (autoScaleEnabled ?? false ? ",autoScaleRun" : string.Empty) }); if ((autoScaleEnabled ?? false) && pool.AutoScaleRun?.Error is not null) { @@ -319,7 +311,7 @@ public static string AutoPoolFormula(bool preemptable, int initialTarget) private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken cancellationToken) { var nodeList = await GetNodesToRemove(false).ToDictionaryAsync(node => node.Id, cancellationToken: cancellationToken); - await foreach (var task in _azureProxy.ListTasksAsync(Id, new ODATADetailLevel { SelectClause = "id,executionInfo,nodeInfo", ExpandClause = "executionInfo,nodeInfo" }).WithCancellation(cancellationToken)) + await foreach (var task in _azureProxy.ListTasksAsync(Id, new ODATADetailLevel { SelectClause = "id,executionInfo,nodeInfo" }).WithCancellation(cancellationToken)) { var nodeId = task.ComputeNodeInformation?.ComputeNodeId; if (nodeId is not null && nodeList.ContainsKey(nodeId)) @@ -690,7 +682,7 @@ async ValueTask RemoveMissingPoolsAsync(Exception ex, CancellationToken ca /// public IAsyncEnumerable GetTaskResizeFailuresAsync(CancellationToken cancellationToken) { - return GetTasksAsync("id", "state eq 'active'", null).Zip( + 
return GetTasksAsync("id", "state eq 'active'").Zip( GetFailures(cancellationToken), (cloud, state) => new CloudTaskBatchTaskState(cloud.Id, state)); @@ -735,7 +727,7 @@ TaskFailureInformation PopNextStartTaskFailure() /// public IAsyncEnumerable GetCompletedTasksAsync(CancellationToken _1) - => GetTasksAsync("id,executionInfo", $"state eq 'completed' and stateTransitionTime lt DateTime'{DateTime.UtcNow - TimeSpan.FromMinutes(2):O}'", "executionInfo"); + => GetTasksAsync("executionInfo,id", $"state eq 'completed' and stateTransitionTime lt DateTime'{DateTime.UtcNow - TimeSpan.FromMinutes(2):O}'"); /// public async ValueTask GetAllocationStateTransitionTimeAsync(CancellationToken cancellationToken = default) @@ -806,7 +798,7 @@ public async ValueTask AssignPoolAsync(CloudPool pool, bool forceRemove, Cancell } // Pool is "broken" if its associated job is missing/not active. Reject this pool via the side effect of the exception that is thrown. - var job = (await _azureProxy.GetBatchJobAsync(pool.Id, cancellationToken, new ODATADetailLevel { SelectClause = "poolInfo,state" })); + var job = await _azureProxy.GetBatchJobAsync(pool.Id, cancellationToken, new ODATADetailLevel { SelectClause = "poolInfo,state" }); if (job.State != JobState.Active || !pool.Id.Equals(job.PoolInformation?.PoolId, StringComparison.OrdinalIgnoreCase)) { throw new InvalidOperationException($"Active Job not found for Pool {pool.Id}"); diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index 34ba90229..111a07446 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -330,7 +330,7 @@ public async Task CreateBatchPoolAsync(BatchModels.Pool poolInfo, boo } catch (BatchException exc) when (BatchErrorCodeStrings.PoolExists.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { - return await GetBatchPoolAsync(poolInfo.Name, cancellationToken, new 
ODATADetailLevel { SelectClause = BatchPool.CloudPoolSelectClause, ExpandClause = "metadata" }); + return await GetBatchPoolAsync(poolInfo.Name, cancellationToken, new ODATADetailLevel { SelectClause = BatchPool.CloudPoolSelectClause }); } } From 3e1b66e42343f6534bd7a20a1bae56bab1b4ba52 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 8 Nov 2023 21:18:09 -0800 Subject: [PATCH 077/202] Fix GetInternalTesTaskBlobUrlWithoutSasToken() --- .../DefaultStorageAccessProviderTests.cs | 4 +-- .../Storage/DefaultStorageAccessProvider.cs | 34 ++++++++----------- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/src/TesApi.Tests/Storage/DefaultStorageAccessProviderTests.cs b/src/TesApi.Tests/Storage/DefaultStorageAccessProviderTests.cs index b18796f18..a25712772 100644 --- a/src/TesApi.Tests/Storage/DefaultStorageAccessProviderTests.cs +++ b/src/TesApi.Tests/Storage/DefaultStorageAccessProviderTests.cs @@ -31,7 +31,7 @@ public class DefaultStorageAccessProviderTests public void Setup() { azureProxyMock = new Mock(); - storageOptions = new StorageOptions() { DefaultAccountName = DefaultStorageAccountName }; + storageOptions = new StorageOptions() { DefaultAccountName = DefaultStorageAccountName, ExecutionsContainerName = StorageAccessProvider.TesExecutionsPathPrefix }; var subscriptionId = Guid.NewGuid().ToString(); storageAccountInfo = new StorageAccountInfo() { @@ -83,7 +83,7 @@ public async Task GetInternalTesTaskBlobUrlAsync_BlobPathAndInternalPathPrefixIs Assert.IsNotNull(url); var uri = new Uri(url); - Assert.AreEqual($"{StorageAccountBlobEndpoint}/{internalPathPrefix}/{blobName.TrimStart('/')}", ToHostWithAbsolutePathOnly(uri)); + Assert.AreEqual($"{StorageAccountBlobEndpoint}{StorageAccessProvider.TesExecutionsPathPrefix}/{internalPathPrefix}/{blobName.TrimStart('/')}", ToHostWithAbsolutePathOnly(uri)); } private static TesTask CreateNewTesTask() diff --git a/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs 
b/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs index 482f55225..50db2f025 100644 --- a/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs +++ b/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs @@ -247,16 +247,23 @@ private static string NormalizedBlobPath(string blobPath) public override string GetInternalTesTaskBlobUrlWithoutSasToken(TesTask task, string blobPath) { var normalizedBlobPath = NormalizedBlobPath(blobPath); - var blobPathWithPrefix = $"{task.Id}{normalizedBlobPath}"; + var blobPathWithPrefix = $"{TesExecutionsPathPrefix}/{task.Id}{normalizedBlobPath}"; + if (task.Resources?.ContainsBackendParameterValue(TesResources.SupportedBackendParameters .internal_path_prefix) == true) { blobPathWithPrefix = $"{task.Resources.GetBackendParameterValue(TesResources.SupportedBackendParameters.internal_path_prefix).Trim('/')}{normalizedBlobPath}"; + + if (storageOptions.ExecutionsContainerName is not null && + !blobPathWithPrefix.StartsWith(storageOptions.ExecutionsContainerName, StringComparison.OrdinalIgnoreCase)) + { + blobPathWithPrefix = $"{storageOptions.ExecutionsContainerName}/{blobPathWithPrefix}"; + } } //passing the resulting string through the builder to ensure that the path is properly encoded and valid - var builder = new BlobUriBuilder(new Uri($"https://{storageOptions.DefaultAccountName}.blob.core.windows.net{TesExecutionsPathPrefix}/{blobPathWithPrefix.TrimStart('/')}")); + var builder = new BlobUriBuilder(new Uri($"https://{storageOptions.DefaultAccountName}.blob.core.windows.net/{blobPathWithPrefix.TrimStart('/')}")); return builder.ToUri().ToString(); } @@ -275,25 +282,12 @@ public override string GetInternalTesBlobUrlWithoutSasToken(string blobPath) /// public override async Task GetInternalTesTaskBlobUrlAsync(TesTask task, string blobPath, BlobSasPermissions sasPermissions, CancellationToken cancellationToken) { - var normalizedBlobPath = NormalizedBlobPath(blobPath); - - if 
(task.Resources?.ContainsBackendParameterValue(TesResources.SupportedBackendParameters - .internal_path_prefix) == true) - { - var internalPath = $"{task.Resources.GetBackendParameterValue(TesResources.SupportedBackendParameters.internal_path_prefix).Trim('/')}{normalizedBlobPath}"; + var pathSegments = StorageAccountUrlSegments.Create(GetInternalTesTaskBlobUrlWithoutSasToken(task, blobPath)); - if (storageOptions.ExecutionsContainerName is not null && - !internalPath.StartsWith(storageOptions.ExecutionsContainerName, StringComparison.OrdinalIgnoreCase)) - { - internalPath = $"{storageOptions.ExecutionsContainerName}/{internalPath}"; - } - - var blobPathWithPathPrefix = - $"/{storageOptions.DefaultAccountName}/{internalPath}"; - return await MapLocalPathToSasUrlAsync(blobPathWithPathPrefix, sasPermissions, cancellationToken, sasTokenDuration: default); - } - - return await GetInternalTesBlobUrlAsync($"/{task.Id}{normalizedBlobPath}", sasPermissions, cancellationToken); + var resultPathSegments = pathSegments.IsContainer + ? 
await AddSasTokenAsync(pathSegments, SasTokenDuration, ConvertSasPermissions(sasPermissions, nameof(sasPermissions)), cancellationToken) + : await AddSasTokenAsync(pathSegments, SasTokenDuration, sasPermissions, cancellationToken); + return resultPathSegments.ToUriString(); } private async Task TryGetStorageAccountInfoAsync(string accountName, CancellationToken cancellationToken, Action onSuccess = null) From 92d194765f7351da4338deee8cdf396aac9224c1 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 8 Nov 2023 21:22:46 -0800 Subject: [PATCH 078/202] Resolve file upload logs in runner --- .../Storage/FileOperationResolver.cs | 7 ++++-- .../Storage/ResolutionPolicyHandler.cs | 2 +- src/Tes.Runner/Transfer/BlobUploader.cs | 11 +++++++++- src/Tes.Runner/Transfer/UploadInfo.cs | 2 +- .../Events/RunnerEventsProcessor.cs | 22 ++++--------------- 5 files changed, 21 insertions(+), 23 deletions(-) diff --git a/src/Tes.Runner/Storage/FileOperationResolver.cs b/src/Tes.Runner/Storage/FileOperationResolver.cs index 5b8b52e8d..e2bddad2a 100644 --- a/src/Tes.Runner/Storage/FileOperationResolver.cs +++ b/src/Tes.Runner/Storage/FileOperationResolver.cs @@ -182,23 +182,26 @@ private IEnumerable ExpandFileOutput(FileOutput output) } } - private static FileOutput CreateExpandedFileOutputWithCombinedTargetUrl(FileOutput output, string absoluteFilePath, string relativePathToSearchPath) + private FileOutput CreateExpandedFileOutputWithCombinedTargetUrl(FileOutput output, string absoluteFilePath, string relativePathToSearchPath) { return new FileOutput() { Path = absoluteFilePath, TargetUrl = ToCombinedTargetUrl(output.TargetUrl!, prefixToRemoveFromPath: string.Empty, relativePathToSearchPath), TransformationStrategy = output.TransformationStrategy, + MountParentDirectory = string.IsNullOrWhiteSpace(output.MountParentDirectory) ? 
null : fileInfoProvider.GetExpandedFileName(output.MountParentDirectory), FileType = FileType.File, }; } - private static FileOutput CreateExpandedFileOutputUsingTargetUrl(FileOutput output, string absoluteFilePath) + + private FileOutput CreateExpandedFileOutputUsingTargetUrl(FileOutput output, string absoluteFilePath) { return new FileOutput() { Path = absoluteFilePath, TargetUrl = output.TargetUrl, TransformationStrategy = output.TransformationStrategy, + MountParentDirectory = string.IsNullOrWhiteSpace(output.MountParentDirectory) ? null : fileInfoProvider.GetExpandedFileName(output.MountParentDirectory), FileType = FileType.File, }; } diff --git a/src/Tes.Runner/Storage/ResolutionPolicyHandler.cs b/src/Tes.Runner/Storage/ResolutionPolicyHandler.cs index d39133997..1af3ca0a2 100644 --- a/src/Tes.Runner/Storage/ResolutionPolicyHandler.cs +++ b/src/Tes.Runner/Storage/ResolutionPolicyHandler.cs @@ -95,7 +95,7 @@ private async Task CreateUploadInfoWithStrategyAsync(FileOutput outp { var uri = await ApplySasResolutionToUrlAsync(output.TargetUrl, output.TransformationStrategy, uploadBlobSasPermissions, runtimeOptions); - return new UploadInfo(output.Path!, uri); + return new UploadInfo(output.Path!, uri, output.MountParentDirectory); } private static async Task ApplySasResolutionToUrlAsync(string? sourceUrl, TransformationStrategy? 
strategy, diff --git a/src/Tes.Runner/Transfer/BlobUploader.cs b/src/Tes.Runner/Transfer/BlobUploader.cs index 6804536f3..4c661b23a 100644 --- a/src/Tes.Runner/Transfer/BlobUploader.cs +++ b/src/Tes.Runner/Transfer/BlobUploader.cs @@ -14,6 +14,7 @@ public class BlobUploader : BlobOperationPipeline { private readonly ConcurrentDictionary hashListProviders = new(); + internal readonly IDictionary MapPathToMountPrefixLength = new Dictionary(); internal readonly ConcurrentBag CompletedFiles = new(); public BlobUploader(BlobPipelineOptions pipelineOptions, Channel memoryBufferPool) : base(pipelineOptions, memoryBufferPool) @@ -143,7 +144,10 @@ public override async Task OnCompletionAsync(long length, Uri? blobUrl, string f } finally { - CompletedFiles.Add(new(length, blobUrl, fileName)); + CompletedFiles.Add(new( + length, + new Azure.Storage.Blobs.BlobUriBuilder(blobUrl) { Sas = null }.ToUri(), + MapPathToMountPrefixLength.TryGetValue(fileName, out var prefixLength) ? fileName[prefixLength..] 
: fileName)); response?.Dispose(); } } @@ -158,6 +162,11 @@ public virtual async Task UploadAsync(List uploadList) { ValidateUploadList(uploadList); + foreach (var upload in uploadList.Where(upload => !string.IsNullOrWhiteSpace(upload.MountParentDirectory)).Where(upload => upload.FullFilePath.StartsWith(upload.MountParentDirectory!))) + { + MapPathToMountPrefixLength[upload.FullFilePath] = upload.MountParentDirectory!.Length; + } + var operationList = uploadList.Select(d => new BlobOperationInfo(d.TargetUri, d.FullFilePath, d.FullFilePath, true)).ToList(); return await ExecutePipelineAsync(operationList); diff --git a/src/Tes.Runner/Transfer/UploadInfo.cs b/src/Tes.Runner/Transfer/UploadInfo.cs index d1b0d6655..bc879b6fb 100644 --- a/src/Tes.Runner/Transfer/UploadInfo.cs +++ b/src/Tes.Runner/Transfer/UploadInfo.cs @@ -3,5 +3,5 @@ namespace Tes.Runner.Transfer { - public record UploadInfo(string FullFilePath, Uri TargetUri); + public record UploadInfo(string FullFilePath, Uri TargetUri, string? 
MountParentDirectory = null); } diff --git a/src/TesApi.Web/Events/RunnerEventsProcessor.cs b/src/TesApi.Web/Events/RunnerEventsProcessor.cs index 3cfc75948..c7daad3bd 100644 --- a/src/TesApi.Web/Events/RunnerEventsProcessor.cs +++ b/src/TesApi.Web/Events/RunnerEventsProcessor.cs @@ -332,8 +332,6 @@ async ValueTask> AddProcessLogsIfAvailable(Tes.Runner.Events static IEnumerable GetFileLogs(IDictionary eventData) { - const string marker = "/wd/"; - if (eventData is null) { yield break; @@ -342,22 +340,10 @@ async ValueTask> AddProcessLogsIfAvailable(Tes.Runner.Events var numberOfFiles = int.Parse(eventData["numberOfFiles"]); for (var i = 0; i < numberOfFiles; ++i) { - var nodePath = eventData[$"filePath-{i}"]; - var idxStart = nodePath.IndexOf(marker); - - if (idxStart > 0) - { - var containerPathUnderRoot = nodePath[(idxStart + marker.Length)..]; - var idxDirectory = containerPathUnderRoot.IndexOf('/'); - - if (idxDirectory > 0) - { - yield return new( - new Azure.Storage.Blobs.BlobUriBuilder(new Uri(eventData[$"fileUri-{i}"])) { Sas = null, Query = null }.ToUri(), - $"/{containerPathUnderRoot}", - long.Parse(eventData[$"fileSize-{i}"])); - } - } + yield return new( + new Uri(eventData[$"fileUri-{i}"]), + eventData[$"filePath-{i}"], + long.Parse(eventData[$"fileSize-{i}"])); } } From 6f44d99b184845f6a5038a1ba3937c00a692a212 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 9 Nov 2023 18:48:01 -0800 Subject: [PATCH 079/202] Eventify compute node failure conditions --- src/Tes/Models/TesTaskExtended.cs | 8 +- src/TesApi.Tests/BatchSchedulerTests.cs | 58 +++-- src/TesApi.Web/AzureBatchTaskState.cs | 2 +- src/TesApi.Web/AzureProxy.cs | 36 +-- src/TesApi.Web/BatchPool.cs | 168 +++++++------ src/TesApi.Web/BatchPoolService.cs | 222 ------------------ src/TesApi.Web/BatchScheduler.cs | 105 ++++++--- .../CachingWithRetriesAzureProxy.cs | 6 +- .../BasicTesTaskContractResolver.cs | 1 - .../FullTesTaskContractResolver.cs | 1 - .../SystemThreadingTaskExtensions.cs | 2 +- 
src/TesApi.Web/IAzureProxy.cs | 4 +- src/TesApi.Web/IBatchPool.cs | 12 +- src/TesApi.Web/IBatchScheduler.cs | 21 +- .../OrchestrateOnBatchSchedulerServiceBase.cs | 4 +- src/TesApi.Web/PoolScheduler.cs | 166 +++++++++++++ .../{TesTaskTask.cs => RelatedTask.cs} | 7 +- src/TesApi.Web/Startup.cs | 4 +- .../{Scheduler.cs => TaskScheduler.cs} | 35 +-- 19 files changed, 421 insertions(+), 441 deletions(-) delete mode 100644 src/TesApi.Web/BatchPoolService.cs create mode 100644 src/TesApi.Web/PoolScheduler.cs rename src/TesApi.Web/{TesTaskTask.cs => RelatedTask.cs} (84%) rename src/TesApi.Web/{Scheduler.cs => TaskScheduler.cs} (85%) diff --git a/src/Tes/Models/TesTaskExtended.cs b/src/Tes/Models/TesTaskExtended.cs index 9d8954cf1..b2ed518fe 100644 --- a/src/Tes/Models/TesTaskExtended.cs +++ b/src/Tes/Models/TesTaskExtended.cs @@ -30,12 +30,6 @@ public partial class TesTask : RepositoryItem [DataMember(Name = "error_count")] public int ErrorCount { get; set; } - /// - /// Boolean of whether deletion after termination is required. - /// - [DataMember(Name = "is_delete_required")] - public bool IsTaskDeletionRequired { get; set; } - /// /// Date + time the task was completed, in RFC 3339 format. This is set by the system, not the client. 
/// @@ -97,7 +91,7 @@ public partial class TesTask : RepositoryItem /// public bool IsActiveState() { - return ActiveStates.Contains(this.State) || IsTaskDeletionRequired; + return ActiveStates.Contains(this.State); } } } diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index 658fdae9a..2263b33d9 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -899,33 +899,45 @@ public async Task TaskGetsCancelled() GuardAssertsWithTesTask(tesTask, () => { Assert.AreEqual(TesState.CANCELEDEnum, tesTask.State); - Assert.IsTrue(tesTask.IsTaskDeletionRequired); azureProxy.Verify(i => i.TerminateBatchTaskAsync(tesTask.Id, It.IsAny(), It.IsAny())); }); } - [TestMethod] - public async Task CancelledTaskGetsDeleted() - { - var tesTask = new TesTask { Id = "test", PoolId = "pool1", State = TesState.CANCELEDEnum, IsTaskDeletionRequired = true, Logs = new() { new() { StartTime = DateTimeOffset.UtcNow - TimeSpan.FromMinutes(11) } } }; - - var azureProxyReturnValues = AzureProxyReturnValues.Defaults; - azureProxyReturnValues.BatchTaskState = BatchTaskStates.Terminated; - Mock azureProxy = default; - var azureProxySetter = new Action>(mock => - { - GetMockAzureProxy(azureProxyReturnValues)(mock); - azureProxy = mock; - }); - - _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxySetter, azureProxyReturnValues); - - GuardAssertsWithTesTask(tesTask, () => - { - Assert.IsFalse(tesTask.IsTaskDeletionRequired); - azureProxy.Verify(i => i.DeleteBatchTaskAsync(tesTask.Id, It.IsAny(), It.IsAny())); - }); - } + //[TestMethod] + //public async Task CancelledTaskGetsDeleted() + //{ + // var tesTask = new TesTask + // { + // Id = "test", PoolId = "pool1", State = TesState.CANCELEDEnum, Logs = new() + // { + // new() + // { + // StartTime = DateTimeOffset.UtcNow - TimeSpan.FromMinutes(11), Logs = new() + // { + // new() { IsCloudTaskDeletionRequired = true, TaskId = 
"cloudTest" } + // } + // } + // } + // }; + + // var azureProxyReturnValues = AzureProxyReturnValues.Defaults; + // azureProxyReturnValues.BatchTaskState = BatchTaskStates.Terminated; + // Mock azureProxy = default; + // var azureProxySetter = new Action>(mock => + // { + // GetMockAzureProxy(azureProxyReturnValues)(mock); + // azureProxy = mock; + // }); + + // _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxySetter, azureProxyReturnValues); + + // GuardAssertsWithTesTask(tesTask, () => + // { + // var executorLog = tesTask.Logs.Last().Logs.Last(); + // Assert.IsFalse(executorLog.IsCloudTaskDeletionRequired); + // azureProxy.Verify(i => i.DeleteBatchTaskAsync(executorLog.TaskId, It.IsAny(), It.IsAny())); + // }); + //} [TestMethod] public async Task SuccessfullyCompletedTaskContainsBatchNodeMetrics() diff --git a/src/TesApi.Web/AzureBatchTaskState.cs b/src/TesApi.Web/AzureBatchTaskState.cs index e836faf7f..74d9c916d 100644 --- a/src/TesApi.Web/AzureBatchTaskState.cs +++ b/src/TesApi.Web/AzureBatchTaskState.cs @@ -10,7 +10,7 @@ namespace TesApi.Web { /// - /// Combined state of the progression of an attempt to run a + /// Snapshot state of the progression of an attempt to run a /// /// Task state. Also see . /// File details after the task has completed successfully, for logging purposes. 
diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index 090df2a45..ad48ee989 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -310,38 +310,12 @@ public async Task TerminateBatchTaskAsync(string tesTaskId, string jobId, Cancel } /// - public async Task DeleteBatchTaskAsync(string tesTaskId, string jobId, CancellationToken cancellationToken) + public async Task DeleteBatchTaskAsync(string cloudTaskId, string jobId, CancellationToken cancellationToken) { - var jobFilter = new ODATADetailLevel - { - FilterClause = $"startswith(id,'{tesTaskId}{BatchJobAttemptSeparator}')", - SelectClause = "id" - }; - - List batchTasksToDelete = default; - - try - { - batchTasksToDelete = await batchClient.JobOperations.ListTasks(jobId, jobFilter).ToAsyncEnumerable().ToListAsync(cancellationToken); - } - catch (BatchException ex) when (ex.InnerException is Microsoft.Azure.Batch.Protocol.Models.BatchErrorException bee && "JobNotFound".Equals(bee.Body?.Code, StringComparison.InvariantCultureIgnoreCase)) - { - logger.LogWarning("Job not found for TES task {TesTask}", tesTaskId); - return; // Task cannot exist if the job is not found. 
- } - - if (batchTasksToDelete.Count > 1) - { - logger.LogWarning("Found more than one active task for TES task {TesTask}", tesTaskId); - } - - foreach (var task in batchTasksToDelete) - { - logger.LogInformation("Deleting task {BatchTask}", task.Id); - var ctx = new Context(); - ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenNodeNotReady(LogRetryErrorOnRetryHandler())); - await batchRetryPolicy.ExecuteAsync((_, ct) => task.DeleteAsync(cancellationToken: ct), ctx, cancellationToken); - } + logger.LogInformation("Deleting task {BatchTask}", cloudTaskId); + var ctx = new Context(); + ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenNodeNotReady(LogRetryErrorOnRetryHandler())); + await batchRetryPolicy.ExecuteAsync((_, ct) => batchClient.JobOperations.DeleteTaskAsync(jobId, cloudTaskId, cancellationToken: ct), ctx, cancellationToken); } /// diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index 26b6713a3..c34c8173d 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -34,7 +34,6 @@ public sealed partial class BatchPool private readonly ILogger _logger; private readonly IAzureProxy _azureProxy; - private readonly Storage.IStorageAccessProvider _storageAccessProvider; /// /// Constructor of . 
@@ -43,11 +42,9 @@ public sealed partial class BatchPool /// /// /// - /// /// - public BatchPool(IBatchScheduler batchScheduler, IOptions batchSchedulingOptions, IAzureProxy azureProxy, ILogger logger, Storage.IStorageAccessProvider storageAccessProvider) + public BatchPool(IBatchScheduler batchScheduler, IOptions batchSchedulingOptions, IAzureProxy azureProxy, ILogger logger) { - _storageAccessProvider = storageAccessProvider; var rotationDays = batchSchedulingOptions.Value.PoolRotationForcedDays; if (rotationDays == 0) { rotationDays = Options.BatchSchedulingOptions.DefaultPoolRotationForcedDays; } _forcePoolRotationAge = TimeSpan.FromDays(rotationDays); @@ -310,27 +307,6 @@ public static string AutoPoolFormula(bool preemptable, int initialTarget) private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken cancellationToken) { - var nodeList = await GetNodesToRemove(false).ToDictionaryAsync(node => node.Id, cancellationToken: cancellationToken); - await foreach (var task in _azureProxy.ListTasksAsync(Id, new ODATADetailLevel { SelectClause = "id,executionInfo,nodeInfo" }).WithCancellation(cancellationToken)) - { - var nodeId = task.ComputeNodeInformation?.ComputeNodeId; - if (nodeId is not null && nodeList.ContainsKey(nodeId)) - { - await _azureProxy.UploadBlobAsync( - new Uri(await _storageAccessProvider.GetInternalTesBlobUrlAsync( - $"nodeError/{nodeId}/{task.Id}-{new Guid():N}", - Azure.Storage.Sas.BlobSasPermissions.Create, - cancellationToken)), - System.Text.Json.JsonSerializer.Serialize(task.ExecutionInformation, - new System.Text.Json.JsonSerializerOptions() - { - DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingDefault, - Converters = { new System.Text.Json.Serialization.JsonStringEnumConverter(System.Text.Json.JsonNamingPolicy.CamelCase) } - }), - cancellationToken); - } - } - // This method implememts a state machine to disable/enable autoscaling as needed to clear certain conditions that can be 
observed // Inputs are _resetAutoScalingRequired, compute nodes in ejectable states, and the current _scalingMode, along with the pool's // allocation state and autoscale enablement. @@ -368,9 +344,6 @@ await _azureProxy.UploadBlobAsync( { case ComputeNodeState.Unusable: _logger.LogDebug("Found unusable node {NodeId}", node.Id); - await SendNodeTaskInformation(node.Id, node.RecentTasks); - //node.RecentTasks[0].ExecutionInformation.FailureInformation.Code == TaskFailureInformationCodes.DiskFull - // TODO: notify running tasks that task will switch nodes? break; case ComputeNodeState.StartTaskFailed: @@ -380,14 +353,10 @@ await _azureProxy.UploadBlobAsync( case ComputeNodeState.Preempted: _logger.LogDebug("Found preempted node {NodeId}", node.Id); - await SendNodeTaskInformation(node.Id, node.RecentTasks); - //node.RecentTasks[0].TaskId - //node.RecentTasks[0].ExecutionInformation.FailureInformation.Category == ErrorCategory.ServerError - // TODO: notify running tasks that task will switch nodes? Or, in the future, terminate the task? break; - default: // Should never reach here. Skip. - continue; + default: + throw new System.Diagnostics.UnreachableException($"Unexpected compute node state '{node.State}' received while looking for nodes to remove from the pool."); } nodesToRemove = nodesToRemove.Append(node); @@ -412,7 +381,7 @@ await _azureProxy.UploadBlobAsync( _scalingMode = ScalingMode.RemovingFailedNodes; _logger.LogInformation(@"Switching pool {PoolId} back to autoscale.", Id); await _azureProxy.EnableBatchPoolAutoScaleAsync(Id, !IsDedicated, AutoScaleEvaluationInterval, AutoPoolFormula, GetTaskCountAsync, cancellationToken); - _autoScaleWaitTime = DateTime.UtcNow + (3 * AutoScaleEvaluationInterval) + (BatchPoolService.RunInterval / 2); + _autoScaleWaitTime = DateTime.UtcNow + (3 * AutoScaleEvaluationInterval) + (PoolScheduler.RunInterval / 2); _scalingMode = _resetAutoScalingRequired ? 
ScalingMode.WaitingForAutoScale : ScalingMode.SettingAutoScale; _resetAutoScalingRequired = false; break; @@ -444,32 +413,7 @@ async ValueTask GetTaskCountAsync(int @default) // Used to make reenabling } IAsyncEnumerable GetNodesToRemove(bool withState) - => _azureProxy.ListComputeNodesAsync(Id, new ODATADetailLevel(filterClause: @"state eq 'starttaskfailed' or state eq 'preempted' or state eq 'unusable'", selectClause: withState ? @"id,recentTasks,state,startTaskInfo" : @"id")); - - async Task SendNodeTaskInformation(string nodeId, IReadOnlyList content) - { - var url = new Uri(await _storageAccessProvider.GetInternalTesBlobUrlAsync( - $"nodeError/{nodeId}-{new Guid():N}", - Azure.Storage.Sas.BlobSasPermissions.Create, - cancellationToken)); - - if (content is null || content!.Any()) - { - await _azureProxy.UploadBlobAsync(url, "No recent tasks found on node.", cancellationToken); - } - else - { - await _azureProxy.UploadBlobAsync( - url, - System.Text.Json.JsonSerializer.Serialize(content, - new System.Text.Json.JsonSerializerOptions() - { - DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingDefault, - Converters = { new System.Text.Json.Serialization.JsonStringEnumConverter(System.Text.Json.JsonNamingPolicy.CamelCase) } - }), - cancellationToken); - } - } + => _azureProxy.ListComputeNodesAsync(Id, new ODATADetailLevel(filterClause: @"state eq 'starttaskfailed' or state eq 'preempted' or state eq 'unusable'", selectClause: withState ? @"id,state,startTaskInfo" : @"id")); } private bool DetermineIsAvailable(DateTime? 
creation) @@ -680,31 +624,87 @@ async ValueTask RemoveMissingPoolsAsync(Exception ex, CancellationToken ca } /// - public IAsyncEnumerable GetTaskResizeFailuresAsync(CancellationToken cancellationToken) + public IAsyncEnumerable GetTasksToDelete(CancellationToken cancellationToken) { - return GetTasksAsync("id", "state eq 'active'").Zip( - GetFailures(cancellationToken), - (cloud, state) => new CloudTaskBatchTaskState(cloud.Id, state)); + return GetTasksAsync("creationTime,id", $"state eq 'completed' and creationTime lt datetime'{DateTimeOffset.UtcNow - TimeSpan.FromMinutes(10):O}'").Select(task => new IBatchScheduler.CloudTaskId(Id, task.Id, task.CreationTime.Value)); + } + + /// + public async IAsyncEnumerable GetCloudTaskStatesAsync([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) + { + List taskListWithComputeNodeInfo; // To check if the task was running when its node became preempted or unusable + List activeTaskList; // These are candidates to be the victim of resizes or starttask failures + List completedTaskList; // Backstop if events don't provide timely task completion information in a timely manner + + { + var taskList = await GetTasksAsync("executionInfo,id,nodeInfo,state,stateTransitionTime", null).ToListAsync(cancellationToken); + taskListWithComputeNodeInfo = taskList.Where(task => !TaskState.Completed.Equals(task.State) && !string.IsNullOrEmpty(task.ComputeNodeInformation?.ComputeNodeId)).ToList(); + activeTaskList = taskList.Where(task => TaskState.Active.Equals(task.State)).OrderByDescending(task => task.StateTransitionTime).ToList(); + completedTaskList = taskList.Where(task => TaskState.Completed.Equals(task.State) && task.StateTransitionTime < DateTime.UtcNow - TimeSpan.FromMinutes(2)).ToList(); + } + + await foreach (var node in _azureProxy.ListComputeNodesAsync(Id, new ODATADetailLevel(filterClause: @"state eq 'preempted' or state eq 'unusable'", selectClause: 
@"errors,id,state")).WithCancellation(cancellationToken)) + { + foreach (var task in taskListWithComputeNodeInfo.Where(task => node.Id.Equals(task.ComputeNodeInformation.ComputeNodeId, StringComparison.InvariantCultureIgnoreCase))) + { + yield return new(task.Id, node.State switch + { + ComputeNodeState.Preempted => new(AzureBatchTaskState.TaskState.NodePreempted), + ComputeNodeState.Unusable => new(AzureBatchTaskState.TaskState.NodeUnusable, Failure: ParseComputeNodeErrors(node.Errors)), + _ => throw new System.Diagnostics.UnreachableException(), + }); + + _ = activeTaskList.Remove(task); + } + } + + await foreach (var state in activeTaskList.ToAsyncEnumerable().Zip(GetFailures(cancellationToken), + (cloud, state) => new CloudTaskBatchTaskState(cloud.Id, state)) + .WithCancellation(cancellationToken)) + { + yield return state; + } + + foreach (var task in completedTaskList) + { + yield return new(task.Id, GetCompletedBatchState(task)); + } + + yield break; + + static AzureBatchTaskState.FailureInformation ParseComputeNodeErrors(IReadOnlyList nodeErrors) + { + var totalList = nodeErrors.Select(nodeError => Enumerable.Empty().Append(nodeError.Code).Append(nodeError.Message) + .Concat(nodeError.ErrorDetails.Select(errorDetail => Enumerable.Empty().Append(errorDetail.Name).Append(errorDetail.Value)).SelectMany(s => s))) + .SelectMany(s => s).ToList(); + + if (totalList.Contains(TaskFailureInformationCodes.DiskFull)) + { + return new(TaskFailureInformationCodes.DiskFull, totalList); + } + else + { + return new(BatchErrorCodeStrings.NodeStateUnusable, totalList); + } + } #pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously async IAsyncEnumerable GetFailures([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously { - cancellationToken.ThrowIfCancellationRequested(); - for (var failure = 
PopNextStartTaskFailure(); failure is not null; failure = PopNextStartTaskFailure()) { yield return ConvertFromStartTask(failure); cancellationToken.ThrowIfCancellationRequested(); } - cancellationToken.ThrowIfCancellationRequested(); - for (var failure = PopNextResizeError(); failure is not null; failure = PopNextResizeError()) { yield return ConvertFromResize(failure); cancellationToken.ThrowIfCancellationRequested(); } + + yield break; } AzureBatchTaskState ConvertFromResize(ResizeError failure) @@ -723,11 +723,33 @@ ResizeError PopNextResizeError() TaskFailureInformation PopNextStartTaskFailure() => StartTaskFailures.TryDequeue(out var failure) ? failure : default; - } - /// - public IAsyncEnumerable GetCompletedTasksAsync(CancellationToken _1) - => GetTasksAsync("executionInfo,id", $"state eq 'completed' and stateTransitionTime lt DateTime'{DateTime.UtcNow - TimeSpan.FromMinutes(2):O}'"); + AzureBatchTaskState GetCompletedBatchState(CloudTask task) + { + _logger.LogDebug("Getting batch task state from completed task {TesTask}.", _batchPools.GetTesTaskIdFromCloudTaskId(task.Id)); + return task.ExecutionInformation.Result switch + { + TaskExecutionResult.Success => new( + AzureBatchTaskState.TaskState.CompletedSuccessfully, + BatchTaskStartTime: task.ExecutionInformation.StartTime, + BatchTaskEndTime: task.ExecutionInformation.EndTime, + BatchTaskExitCode: task.ExecutionInformation.ExitCode), + + TaskExecutionResult.Failure => new( + AzureBatchTaskState.TaskState.CompletedWithErrors, + Failure: new(task.ExecutionInformation.FailureInformation.Code, + Enumerable.Empty() + .Append(task.ExecutionInformation.FailureInformation.Message) + .Append($"Batch task ExitCode: {task.ExecutionInformation?.ExitCode}, Failure message: {task.ExecutionInformation?.FailureInformation?.Message}") + .Concat(task.ExecutionInformation.FailureInformation.Details.Select(pair => pair.Value))), + BatchTaskStartTime: task.ExecutionInformation.StartTime, + BatchTaskEndTime: 
task.ExecutionInformation.EndTime, + BatchTaskExitCode: task.ExecutionInformation.ExitCode), + + _ => throw new System.Diagnostics.UnreachableException(), + }; + } + } /// public async ValueTask GetAllocationStateTransitionTimeAsync(CancellationToken cancellationToken = default) diff --git a/src/TesApi.Web/BatchPoolService.cs b/src/TesApi.Web/BatchPoolService.cs deleted file mode 100644 index 88706250b..000000000 --- a/src/TesApi.Web/BatchPoolService.cs +++ /dev/null @@ -1,222 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. - -using System; -using System.Collections.Concurrent; -using System.Collections.Generic; -using System.Linq; -using System.Threading; -using System.Threading.Tasks; -using Microsoft.Azure.Batch; -using Microsoft.Extensions.Logging; -using Tes.Models; -using Tes.Repository; -using static TesApi.Web.IBatchPool; - -namespace TesApi.Web -{ - /// - /// A background service that montitors CloudPools in the batch system, orchestrates their lifecycle, and updates their state. - /// This should only be used as a system-wide singleton service. This class does not support scale-out on multiple machines, - /// nor does it implement a leasing mechanism. In the future, consider using the Lease Blob operation. - /// - internal class BatchPoolService : OrchestrateOnBatchSchedulerServiceBase - { - /// - /// Interval between each call to . - /// - public static readonly TimeSpan RunInterval = TimeSpan.FromSeconds(30); // The very fastest process inside of Azure Batch accessing anything within pools or jobs uses a 30 second polling interval - public static readonly TimeSpan CompletedCloudTasksRunInterval = TimeSpan.FromSeconds(90); - - /// - /// Default constructor - /// - /// Used for requesting termination of the current application during initialization. 
- /// The main TES task database repository implementation - /// - /// - /// - public BatchPoolService(Microsoft.Extensions.Hosting.IHostApplicationLifetime hostApplicationLifetime, IRepository repository, IBatchScheduler batchScheduler, ILogger logger) - : base(hostApplicationLifetime, repository, batchScheduler, logger) { } - - /// - protected override void ExecuteSetup(CancellationToken stoppingToken) - { - batchScheduler.LoadExistingPoolsAsync(stoppingToken).Wait(stoppingToken); // Delay starting Scheduler until this completes to finish initializing BatchScheduler. - } - - /// - protected override Task ExecuteCoreAsync(CancellationToken stoppingToken) - { - return Task.WhenAll(ServiceBatchPoolsAsync(stoppingToken), ExecuteCompletedTesTasksOnBatchAsync(stoppingToken)); - } - - /// - /// Performs an action on each batch pool. - /// - /// - /// - /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. - /// - private async ValueTask ExecuteActionOnPoolsAsync(string pollName, Func action, CancellationToken stoppingToken) - { - ArgumentNullException.ThrowIfNull(action); - - var pools = batchScheduler.GetPools().ToList(); - - if (0 == pools.Count) - { - return; - } - - var startTime = DateTime.UtcNow; - - await Parallel.ForEachAsync(pools, stoppingToken, async (pool, token) => - { - try - { - await action(pool, token); - } - catch (Exception exc) - { - logger.LogError(exc, @"Batch pool {PoolId} threw an exception in {Poll}.", pool.Id, pollName); - } - }); - - logger.LogDebug(@"{Poll} for {PoolsCount} pools completed in {TotalSeconds} seconds.", pollName, pools.Count, DateTime.UtcNow.Subtract(startTime).TotalSeconds); - } - - /// - /// Calls repeatedly. - /// - /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. 
- /// - private Task ServiceBatchPoolsAsync(CancellationToken stoppingToken) - { - return ExecuteActionOnIntervalAsync(RunInterval, ExecuteServiceBatchPoolsAsync, stoppingToken); - } - - /// - /// Retrieves all batch pools from the database and affords an opportunity to react to changes. - /// - /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. - /// - private async ValueTask ExecuteServiceBatchPoolsAsync(CancellationToken stoppingToken) - { - var list = new ConcurrentBag<(TesTask TesTask, AzureBatchTaskState State)>(); - - await ExecuteActionOnPoolsAsync( - "Service Batch Pools", - async (pool, token) => - { - await pool.ServicePoolAsync(token); - await ProcessFailures(pool.GetTaskResizeFailuresAsync(token), token); - }, - stoppingToken); - - if (list.IsEmpty) - { - return; - } - - await OrchestrateTesTasksOnBatchAsync( - "Failures", -#pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously - async _ => list.Select(t => t.TesTask).ToAsyncEnumerable(), -#pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously - (tesTasks, token) => batchScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, list.Select(t => t.State).ToArray(), token), - stoppingToken); - - async ValueTask ProcessFailures(IAsyncEnumerable failures, CancellationToken cancellationToken) - { - await foreach (var (cloudTaskId, state) in failures.WithCancellation(cancellationToken)) - { - TesTask tesTask = default; - if (await repository.TryGetItemAsync(batchScheduler.GetTesTaskIdFromCloudTaskId(cloudTaskId), cancellationToken, task => tesTask = task) && tesTask is not null) - { - list.Add((tesTask, state)); - } - } - } - } - - /// - /// Calls repeatedly. - /// - /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. 
- /// - private Task ExecuteCompletedTesTasksOnBatchAsync(CancellationToken stoppingToken) - { - return ExecuteActionOnIntervalAsync(CompletedCloudTasksRunInterval, ProcessCompletedCloudTasksAsync, stoppingToken); - } - - /// - /// Retrieves all completed tasks from every batch pools from the database and affords an opportunity to react to changes. - /// - /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. - /// - private async ValueTask ProcessCompletedCloudTasksAsync(CancellationToken stoppingToken) - { - var tasks = new ConcurrentBag(); - - await ExecuteActionOnPoolsAsync("Service Batch Tasks", async (pool, token) => await pool.GetCompletedTasksAsync(token).ForEachAsync(tasks.Add, token), stoppingToken); - - if (tasks.IsEmpty) - { - return; - } - - await OrchestrateTesTasksOnBatchAsync( - "Completed", -#pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously - async cancellationToken => GetTesTasks(cancellationToken), -#pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously - (tesTasks, token) => batchScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, tasks.Select(GetCompletedBatchState).ToArray(), token), - stoppingToken); - - async IAsyncEnumerable GetTesTasks([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) - { - foreach (var tesTaskId in tasks.Select(t => batchScheduler.GetTesTaskIdFromCloudTaskId(t.Id))) - { - TesTask tesTask = default; - if (await repository.TryGetItemAsync(tesTaskId, cancellationToken, task => tesTask = task) && tesTask is not null) - { - logger.LogDebug("Completing task {TesTask}.", tesTask.Id); - yield return tesTask; - } - else - { - logger.LogDebug("Could not find completed task {TesTask}.", tesTaskId); - yield return null; - } - } - } - - AzureBatchTaskState GetCompletedBatchState(CloudTask task) - { - logger.LogDebug("Getting batch task state from completed 
task {TesTask}.", batchScheduler.GetTesTaskIdFromCloudTaskId(task.Id)); - return task.ExecutionInformation.Result switch - { - Microsoft.Azure.Batch.Common.TaskExecutionResult.Success => new( - AzureBatchTaskState.TaskState.CompletedSuccessfully, - BatchTaskStartTime: task.ExecutionInformation.StartTime, - BatchTaskEndTime: task.ExecutionInformation.EndTime, - BatchTaskExitCode: task.ExecutionInformation.ExitCode), - - Microsoft.Azure.Batch.Common.TaskExecutionResult.Failure => new( - AzureBatchTaskState.TaskState.CompletedWithErrors, - Failure: new(task.ExecutionInformation.FailureInformation.Code, - Enumerable.Empty() - .Append(task.ExecutionInformation.FailureInformation.Message) - .Append($"Batch task ExitCode: {task.ExecutionInformation?.ExitCode}, Failure message: {task.ExecutionInformation?.FailureInformation?.Message}") - .Concat(task.ExecutionInformation.FailureInformation.Details.Select(pair => pair.Value))), - BatchTaskStartTime: task.ExecutionInformation.StartTime, - BatchTaskEndTime: task.ExecutionInformation.EndTime, - BatchTaskExitCode: task.ExecutionInformation.ExitCode), - - _ => throw new System.Diagnostics.UnreachableException(), - }; - } - } - } -} diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 8ec1ab4ed..37a3f3ebe 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -15,12 +15,14 @@ using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using Tes.Extensions; +using Tes.Models; using TesApi.Web.Events; using TesApi.Web.Extensions; using TesApi.Web.Management; using TesApi.Web.Management.Models.Quotas; using TesApi.Web.Runner; using TesApi.Web.Storage; +using static TesApi.Web.IBatchScheduler; using BatchModels = Microsoft.Azure.Management.Batch.Models; using TesException = Tes.Models.TesException; using TesFileType = Tes.Models.TesFileType; @@ -174,7 +176,6 @@ public BatchScheduler( static bool tesTaskIsInitializingOrRunning(TesTask tesTask) => 
tesTask.State == TesState.INITIALIZINGEnum || tesTask.State == TesState.RUNNINGEnum; static bool tesTaskIsQueuedOrInitializing(TesTask tesTask) => tesTask.State == TesState.QUEUEDEnum || tesTask.State == TesState.INITIALIZINGEnum; static bool tesTaskIsQueued(TesTask tesTask) => tesTask.State == TesState.QUEUEDEnum; - static bool tesTaskDeletionReady(TesTask tesTask) => tesTask.IsTaskDeletionRequired; var setTaskStateLock = new object(); @@ -282,25 +283,25 @@ Tes.Models.TesOutputFileLog ConvertOutputFileLogToTesOutputFileLog(AzureBatchTas async Task SetTaskCompleted(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { - await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); + await TerminateBatchTaskAsync(tesTask, batchInfo, cancellationToken); return await SetTaskStateAndLog(tesTask, TesState.COMPLETEEnum, batchInfo, cancellationToken); } async Task SetTaskExecutorError(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { - await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); + await TerminateBatchTaskAsync(tesTask, batchInfo, cancellationToken); return await SetTaskStateAndLog(tesTask, TesState.EXECUTORERROREnum, batchInfo, cancellationToken); } async Task SetTaskSystemError(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { - await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); + await TerminateBatchTaskAsync(tesTask, batchInfo, cancellationToken); return await SetTaskStateAndLog(tesTask, TesState.SYSTEMERROREnum, batchInfo, cancellationToken); } async Task SetTaskStateAfterFailureAsync(TesTask tesTask, TesState newTaskState, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { - await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); + await TerminateBatchTaskAsync(tesTask, batchInfo, cancellationToken); return 
await SetTaskStateAndLog(tesTask, newTaskState, batchInfo, cancellationToken); } @@ -314,11 +315,17 @@ Task AddSystemLogAndSetTaskSystemErrorAsync(TesTask tesTask, CombinedBatch return SetTaskSystemError(tesTask, new(batchInfo, additionalSystemLogItem), cancellationToken); } - Task HandlePreemptedNodeAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) + bool HandlePreemptedNode(TesTask tesTask, CombinedBatchTaskInfo batchInfo) { - logger.LogInformation("The TesTask {TesTask}'s node was preempted. It will be automatically rescheduled.", tesTask.Id); + // TODO: Keep track of the number of times Azure Batch retried this task and fail it as preempted if it is too many times. Waiting on Cromwell to support preempted tasks to do this. + var oldLog = tesTask.GetOrAddTesTaskLog(); + var newLog = tesTask.AddTesTaskLog(); + oldLog.Warning = "ComputeNode was preempted. The task will be automatically rescheduled."; + newLog.VirtualMachineInfo = oldLog.VirtualMachineInfo; + newLog.StartTime = DateTimeOffset.UtcNow; tesTask.State = TesState.INITIALIZINGEnum; - return Task.FromResult(false); + logger.LogInformation("The TesTask {TesTask}'s node was preempted. 
It will be automatically rescheduled.", tesTask.Id); + return true; } Task HandleInfoUpdate(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) @@ -328,10 +335,7 @@ Task HandleInfoUpdate(TesTask tesTask, CombinedBatchTaskInfo batchInfo, Ca tesTaskStateTransitions = new List() { - new TesTaskStateTransition(tesTaskDeletionReady, AzureBatchTaskState.TaskState.CancellationRequested, alternateSystemLogItem: null, (tesTask, _, ct) => DeleteCancelledTaskAsync(tesTask, ct)), - new TesTaskStateTransition(condition: null, AzureBatchTaskState.TaskState.CancellationRequested, alternateSystemLogItem: null, (tesTask, _, ct) => TerminateBatchTaskAsync(tesTask, ct)), - //new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.JobNotFound, alternateSystemLogItem: null, (tesTask, _, ct) => AddBatchTaskAsync(tesTask, ct)), - //new TesTaskStateTransition(tesTaskIsQueued, BatchTaskState.MissingBatchTask, alternateSystemLogItem: null, (tesTask, _, ct) => AddBatchTaskAsync(tesTask, ct)), + new TesTaskStateTransition(condition: null, AzureBatchTaskState.TaskState.CancellationRequested, alternateSystemLogItem: null, TerminateBatchTaskAsync), new TesTaskStateTransition(tesTaskIsQueued, AzureBatchTaskState.TaskState.Initializing, alternateSystemLogItem: null, (tesTask, _) => { tesTask.State = TesState.INITIALIZINGEnum; return true; }), new TesTaskStateTransition(tesTaskIsQueuedOrInitializing, AzureBatchTaskState.TaskState.NodeAllocationFailed, alternateSystemLogItem: null, RequeueTaskAfterFailureAsync), new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.Running, alternateSystemLogItem: null, (tesTask, info, ct) => SetTaskStateAndLog(tesTask, TesState.RUNNINGEnum, info, ct)), @@ -343,44 +347,80 @@ Task HandleInfoUpdate(TesTask tesTask, CombinedBatchTaskInfo batchInfo, Ca new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.NodeUnusable, "Please open an issue. 
There should have been an error reported here.", SetTaskExecutorError), //new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.JobNotFound, BatchTaskState.JobNotFound.ToString(), SetTaskSystemError), //new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.MissingBatchTask, BatchTaskState.MissingBatchTask.ToString(), DeleteBatchJobAndSetTaskSystemErrorAsync), - new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.NodePreempted, alternateSystemLogItem: null, HandlePreemptedNodeAsync), + new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.NodePreempted, alternateSystemLogItem: null, HandlePreemptedNode), new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.NodeFilesUploadOrDownloadFailed, alternateSystemLogItem: null, HandleInfoUpdate), new TesTaskStateTransition(condition: null, AzureBatchTaskState.TaskState.InfoUpdate, alternateSystemLogItem: null, HandleInfoUpdate), }.AsReadOnly(); } - private async Task DeleteCancelledTaskAsync(TesTask tesTask, CancellationToken cancellationToken) + private async Task DeleteTerminatedTaskAsync(string taskId, string jobId, DateTime taskCreated, CancellationToken cancellationToken) { // https://learn.microsoft.com/azure/batch/best-practices#manage-task-lifetime var mins10 = TimeSpan.FromMinutes(10); var now = DateTimeOffset.UtcNow; - if (!tesTask.Logs.Any(l => now - l.StartTime > mins10)) + if (!(now - taskCreated > mins10)) { return false; } - await azureProxy.DeleteBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); - tesTask.IsTaskDeletionRequired = false; - await taskExecutionScriptingManager.TryUploadServerTesTask(tesTask, "server-tes-task-completed.json", cancellationToken); - return true; + try + { + await azureProxy.DeleteBatchTaskAsync(taskId, jobId, cancellationToken); + return true; + } + catch (BatchException exc) when 
(BatchErrorCodeStrings.TaskNotFound.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) + { + return true; + } } - private async Task TerminateBatchTaskAsync(TesTask tesTask, CancellationToken cancellationToken) + private async Task TerminateBatchTaskAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { try { + switch (batchInfo.State) + { + case AzureBatchTaskState.TaskState.CompletedSuccessfully: + case AzureBatchTaskState.TaskState.CompletedWithErrors: + return false; // Let it finish on its own + + + case AzureBatchTaskState.TaskState.CancellationRequested: + if (!TesTask.ActiveStates.Contains(tesTask.State)) + { + return false; // It probably already finished on its own + } + + tesTask.State = TesState.CANCELEDEnum; + goto default; + + default: + break; + } + await azureProxy.TerminateBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); - tesTask.IsTaskDeletionRequired = true; - tesTask.State = TesState.CANCELEDEnum; - return true; + //return true; + } + catch (BatchException exc) when (BatchErrorCodeStrings.TaskNotFound.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) + { + //return true; } - //TODO: catch exception returned if the task was already completed. 
catch (Exception exc) { logger.LogError(exc, "Exception terminating batch task with tesTask.Id: {TesTaskId}", tesTask?.Id); throw; } + + try + { + await taskExecutionScriptingManager.TryUploadServerTesTask(tesTask, "server-tes-task-completed.json", cancellationToken); + } + catch (Exception exc) + { + logger.LogError(exc, "Exception uploading task with tesTask.Id: {TesTaskId}", tesTask?.Id); + } + return true; } /// @@ -440,21 +480,26 @@ public async Task UploadTaskRunnerIfNeeded(CancellationToken cancellationToken) } /// - public IAsyncEnumerable> ProcessTesTaskBatchStatesAsync(IEnumerable tesTasks, AzureBatchTaskState[] taskStates, CancellationToken cancellationToken) + public IAsyncEnumerable> ProcessTesTaskBatchStatesAsync(IEnumerable tesTasks, AzureBatchTaskState[] taskStates, CancellationToken cancellationToken) { ArgumentNullException.ThrowIfNull(tesTasks); ArgumentNullException.ThrowIfNull(taskStates); return taskStates.Zip(tesTasks, (TaskState, TesTask) => (TaskState, TesTask)) .Where(entry => entry.TesTask?.IsActiveState() ?? false) // Removes already terminal (and null) TesTasks from being further processed. 
- .Select(entry => new TesTaskTask(WrapHandleTesTaskTransitionAsync(entry.TesTask, entry.TaskState, cancellationToken), entry.TesTask)) + .Select(entry => new RelatedTask(WrapHandleTesTaskTransitionAsync(entry.TesTask, entry.TaskState, cancellationToken), entry.TesTask)) .WhenEach(cancellationToken, tesTaskTask => tesTaskTask.Task); async Task WrapHandleTesTaskTransitionAsync(TesTask tesTask, AzureBatchTaskState azureBatchTaskState, CancellationToken cancellationToken) => await HandleTesTaskTransitionAsync(tesTask, azureBatchTaskState, cancellationToken); + } - //Task WrapHandleTesTaskTransitionAsync(TesTask tesTask, AzureBatchTaskState azureBatchTaskState, CancellationToken cancellationToken) - // => Task.Run(async () => await HandleTesTaskTransitionAsync(tesTask, azureBatchTaskState, cancellationToken)); + /// + public IAsyncEnumerable> DeleteCloudTasksAsync(IAsyncEnumerable cloudTasks, CancellationToken cancellationToken) + { +#pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously + return cloudTasks.SelectAwaitWithCancellation(async (task, cancellationToken) => new RelatedTask(DeleteTerminatedTaskAsync(task.TaskId, task.JobId, task.Created, cancellationToken), task)); +#pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously } private static string GetCromwellExecutionDirectoryPathAsUrl(TesTask task) @@ -538,7 +583,7 @@ private static bool IsCromwellCommandScript(TesInput inputFile) => (inputFile.Name?.Equals("commandScript") ?? false) && (inputFile.Description?.EndsWith(".commandScript") ?? 
false) && inputFile.Type == TesFileType.FILEEnum && inputFile.Path.EndsWith($"/{CromwellScriptFileName}"); /// - public async IAsyncEnumerable> ProcessQueuedTesTasksAsync(TesTask[] tesTasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) + public async IAsyncEnumerable> ProcessQueuedTesTasksAsync(TesTask[] tesTasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) { var tasksMetadataByPoolKey = new Dictionary Identities, string PoolDisplayName)>>(); var poolKeyByTaskIds = new Dictionary(); // Reverse lookup of 'tasksMetadataByPoolKey' @@ -935,7 +980,7 @@ private void ValidateTesTaskInput(TesInput inputFile, TesTask tesTask) var globalStartTaskConfigured = !string.IsNullOrWhiteSpace(globalStartTaskPath); var startTaskSasUrl = globalStartTaskConfigured - ? await storageAccessProvider.MapLocalPathToSasUrlAsync(globalStartTaskPath, storageAccessProvider.DefaultBlobPermissions, cancellationToken, sasTokenDuration: BatchPoolService.RunInterval.Multiply(2).Add(poolLifetime).Add(TimeSpan.FromMinutes(15))) + ? 
await storageAccessProvider.MapLocalPathToSasUrlAsync(globalStartTaskPath, storageAccessProvider.DefaultBlobPermissions, cancellationToken, sasTokenDuration: PoolScheduler.RunInterval.Multiply(2).Add(poolLifetime).Add(TimeSpan.FromMinutes(15))) : default; if (startTaskSasUrl is not null) diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index 111a07446..4915f4042 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -142,13 +142,13 @@ public async Task DeleteBatchJobAsync(string jobId, CancellationToken cancellati } /// - public async Task DeleteBatchTaskAsync(string tesTaskId, string jobId, CancellationToken cancellationToken) + public async Task DeleteBatchTaskAsync(string cloudTaskId, string jobId, CancellationToken cancellationToken) { try { var ctx = new Context(); ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound(LogRetryErrorOnRetryHandler())); - await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchTaskAsync(tesTaskId, jobId, ct), cancellationToken, ctx); + await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchTaskAsync(cloudTaskId, jobId, ct), cancellationToken, ctx); } catch (BatchException exc) when (BatchErrorCodeStrings.TaskNotFound.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { } @@ -342,7 +342,7 @@ public Task GetFullAllocationStateAsync(string poo return cachingRetryHandler.ExecuteWithRetryAndCachingAsync( $"{nameof(CachingWithRetriesAzureProxy)}:{poolId}", ct => azureProxy.GetFullAllocationStateAsync(poolId, ct), - DateTimeOffset.UtcNow.Add(BatchPoolService.RunInterval.Divide(2)), + DateTimeOffset.UtcNow.Add(PoolScheduler.RunInterval.Divide(2)), cancellationToken, ctx); } diff --git a/src/TesApi.Web/ContractResolvers/BasicTesTaskContractResolver.cs 
b/src/TesApi.Web/ContractResolvers/BasicTesTaskContractResolver.cs index feae42730..558a052c8 100644 --- a/src/TesApi.Web/ContractResolvers/BasicTesTaskContractResolver.cs +++ b/src/TesApi.Web/ContractResolvers/BasicTesTaskContractResolver.cs @@ -28,7 +28,6 @@ public class BasicTesTaskContractResolver : DefaultContractResolver Tuple.Create(typeof(TesExecutorLog), nameof(TesExecutorLog.Stderr)), Tuple.Create(typeof(TesInput), nameof(TesInput.Content)), Tuple.Create(typeof(TesTaskLog), nameof(TesTaskLog.SystemLogs)), - Tuple.Create(typeof(TesTask), nameof(TesTask.IsTaskDeletionRequired)), Tuple.Create(typeof(TesTask), nameof(TesTask.ErrorCount)), Tuple.Create(typeof(TesTask), nameof(TesTask.EndTime)), Tuple.Create(typeof(TesTask), nameof(TesTask.PoolId)), diff --git a/src/TesApi.Web/ContractResolvers/FullTesTaskContractResolver.cs b/src/TesApi.Web/ContractResolvers/FullTesTaskContractResolver.cs index e9c549443..15439ff64 100644 --- a/src/TesApi.Web/ContractResolvers/FullTesTaskContractResolver.cs +++ b/src/TesApi.Web/ContractResolvers/FullTesTaskContractResolver.cs @@ -19,7 +19,6 @@ public class FullTesTaskContractResolver : DefaultContractResolver // In FULL view, task message will include all fields EXCEPT custom fields added to support running TES with Cromwell on Azure private static readonly List> PropertiesToSkip = new() { - Tuple.Create(typeof(TesTask), nameof(TesTask.IsTaskDeletionRequired)), Tuple.Create(typeof(TesTask), nameof(TesTask.ErrorCount)), Tuple.Create(typeof(TesTask), nameof(TesTask.EndTime)), Tuple.Create(typeof(TesTask), nameof(TesTask.PoolId)), diff --git a/src/TesApi.Web/Extensions/SystemThreadingTaskExtensions.cs b/src/TesApi.Web/Extensions/SystemThreadingTaskExtensions.cs index c10aee2b3..0f97ea435 100644 --- a/src/TesApi.Web/Extensions/SystemThreadingTaskExtensions.cs +++ b/src/TesApi.Web/Extensions/SystemThreadingTaskExtensions.cs @@ -62,7 +62,7 @@ public static async IAsyncEnumerable WhenEach(this IEnumerable source, }, 
cancellationToken, TaskContinuationOptions.DenyChildAttach, - TaskScheduler.Default); + System.Threading.Tasks.TaskScheduler.Default); }); // Return all completed entries as their tasks are completed, no matter if by failure, cancellation, or running to completion. diff --git a/src/TesApi.Web/IAzureProxy.cs b/src/TesApi.Web/IAzureProxy.cs index 9355b0fba..70306814b 100644 --- a/src/TesApi.Web/IAzureProxy.cs +++ b/src/TesApi.Web/IAzureProxy.cs @@ -66,10 +66,10 @@ public interface IAzureProxy /// /// Deletes an Azure Batch task /// - /// The unique TES task ID + /// The batch task to delete /// The batch job that contains the task /// A for controlling the lifetime of the asynchronous operation. - Task DeleteBatchTaskAsync(string tesTaskId, string jobId, CancellationToken cancellationToken); + Task DeleteBatchTaskAsync(string cloudTaskId, string jobId, CancellationToken cancellationToken); /// /// Gets the counts of active batch nodes, grouped by VmSize diff --git a/src/TesApi.Web/IBatchPool.cs b/src/TesApi.Web/IBatchPool.cs index 9bf97e43a..b23c3ec37 100644 --- a/src/TesApi.Web/IBatchPool.cs +++ b/src/TesApi.Web/IBatchPool.cs @@ -59,25 +59,25 @@ public interface IBatchPool ValueTask ServicePoolAsync(CancellationToken cancellationToken = default); /// - /// Gets nonrecoverable compute node related failures that occur before tasks are assigned to compute nodes. + /// Gets compute node related task state information. /// /// /// - IAsyncEnumerable GetTaskResizeFailuresAsync(CancellationToken cancellationToken = default); + IAsyncEnumerable GetCloudTaskStatesAsync(CancellationToken cancellationToken = default); /// - /// Gets the last time the pool's compute node list was changed. + /// Gets batch tasks that need to be deleted. 
/// /// /// - ValueTask GetAllocationStateTransitionTimeAsync(CancellationToken cancellationToken = default); + IAsyncEnumerable GetTasksToDelete(CancellationToken cancellationToken = default); /// - /// Gets the completed tasks in this pool's associated job. + /// Gets the last time the pool's compute node list was changed. /// /// /// - IAsyncEnumerable GetCompletedTasksAsync(CancellationToken cancellationToken); + ValueTask GetAllocationStateTransitionTimeAsync(CancellationToken cancellationToken = default); /// /// A not yet assigned a compute nodes to remove due to a nonrecoverable compute node or pool resize error. diff --git a/src/TesApi.Web/IBatchScheduler.cs b/src/TesApi.Web/IBatchScheduler.cs index 52bdd6633..a5ef55a08 100644 --- a/src/TesApi.Web/IBatchScheduler.cs +++ b/src/TesApi.Web/IBatchScheduler.cs @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. +using System; using System.Collections.Generic; using System.Threading; using System.Threading.Tasks; @@ -42,7 +43,7 @@ public interface IBatchScheduler /// s corresponding to each . /// A for controlling the lifetime of the asynchronous operation. /// True for each corresponding that needs to be persisted. - IAsyncEnumerable> ProcessTesTaskBatchStatesAsync(IEnumerable tesTasks, AzureBatchTaskState[] taskStates, CancellationToken cancellationToken); + IAsyncEnumerable> ProcessTesTaskBatchStatesAsync(IEnumerable tesTasks, AzureBatchTaskState[] taskStates, CancellationToken cancellationToken); /// /// Schedule queued s on a batch system @@ -50,7 +51,7 @@ public interface IBatchScheduler /// s to schedule on the batch system. /// A for controlling the lifetime of the asynchronous operation. /// True for each that needs to be persisted. 
- IAsyncEnumerable> ProcessQueuedTesTasksAsync(TesTask[] tesTasks, CancellationToken cancellationToken); + IAsyncEnumerable> ProcessQueuedTesTasksAsync(TesTask[] tesTasks, CancellationToken cancellationToken); /// /// Adds to the managed batch pools. @@ -95,6 +96,14 @@ public interface IBatchScheduler /// . string GetTesTaskIdFromCloudTaskId(string cloudTaskId); + /// + /// Deletes azure batch tasks. + /// + /// s to delete from the batch system. + /// A for controlling the lifetime of the asynchronous operation. + /// True for each that was either deleted or not found. + IAsyncEnumerable> DeleteCloudTasksAsync(IAsyncEnumerable tasks, CancellationToken cancellationToken); + /// /// Gets unprocessed events from the storage account. /// @@ -102,5 +111,13 @@ public interface IBatchScheduler /// Optional event to retrieve. Defaults to all events. /// IAsyncEnumerable GetEventMessagesAsync(CancellationToken cancellationToken, string @event = default); + + /// + /// Identifies an azure cloud task. + /// + /// that contains the task. + /// . + /// + public record struct CloudTaskId(string JobId, string TaskId, DateTime Created); } } diff --git a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs index 68bb1142e..3afdefc73 100644 --- a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs +++ b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs @@ -145,7 +145,7 @@ protected async Task ExecuteActionOnIntervalAsync(TimeSpan runInterval, Func, performs an action in the batch system using , and updates the resultant state /// /// A System.Threading.Tasks.ValueTask that represents the long running operations. 
- protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func>> tesTaskGetter, Func>> tesTaskProcessor, CancellationToken stoppingToken, string unitsLabel = "tasks") + protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func>> tesTaskGetter, Func>> tesTaskProcessor, CancellationToken stoppingToken, string unitsLabel = "tasks") { var tesTasks = await (await tesTaskGetter(stoppingToken)).ToArrayAsync(stoppingToken); @@ -159,7 +159,7 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func< await foreach (var tesTaskTask in tesTaskProcessor(tesTasks, stoppingToken).WithCancellation(stoppingToken)) { - var tesTask = tesTaskTask.TesTask; + var tesTask = tesTaskTask.Related; try { diff --git a/src/TesApi.Web/PoolScheduler.cs b/src/TesApi.Web/PoolScheduler.cs new file mode 100644 index 000000000..1475c3ef0 --- /dev/null +++ b/src/TesApi.Web/PoolScheduler.cs @@ -0,0 +1,166 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using Microsoft.Azure.Batch; +using Microsoft.Extensions.Logging; +using Tes.Models; +using Tes.Repository; +using static TesApi.Web.IBatchPool; + +namespace TesApi.Web +{ + /// + /// A background service that montitors CloudPools in the batch system, orchestrates their lifecycle, and updates their state. + /// This should only be used as a system-wide singleton service. This class does not support scale-out on multiple machines, + /// nor does it implement a leasing mechanism. In the future, consider using the Lease Blob operation. + /// + internal class PoolScheduler : OrchestrateOnBatchSchedulerServiceBase + { + /// + /// Interval between each call to . 
+ /// + public static readonly TimeSpan RunInterval = TimeSpan.FromSeconds(30); // The very fastest process inside of Azure Batch accessing anything within pools or jobs uses a 30 second polling interval + + /// + /// Default constructor + /// + /// Used for requesting termination of the current application during initialization. + /// The main TES task database repository implementation + /// + /// + /// + public PoolScheduler(Microsoft.Extensions.Hosting.IHostApplicationLifetime hostApplicationLifetime, IRepository repository, IBatchScheduler batchScheduler, ILogger logger) + : base(hostApplicationLifetime, repository, batchScheduler, logger) { } + + /// + protected override void ExecuteSetup(CancellationToken stoppingToken) + { + batchScheduler.LoadExistingPoolsAsync(stoppingToken).Wait(stoppingToken); // Delay starting Scheduler until this completes to finish initializing BatchScheduler. + } + + /// + protected override Task ExecuteCoreAsync(CancellationToken stoppingToken) + { + return ServiceBatchPoolsAsync(stoppingToken); + } + + /// + /// Performs an action on each batch pool. + /// + /// + /// + /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. 
+ /// + private async ValueTask ExecuteActionOnPoolsAsync(string pollName, Func action, CancellationToken stoppingToken) + { + ArgumentNullException.ThrowIfNull(action); + + var pools = batchScheduler.GetPools().ToList(); + + if (0 == pools.Count) + { + return; + } + + var startTime = DateTime.UtcNow; + + await Parallel.ForEachAsync(pools, stoppingToken, async (pool, token) => + { + try + { + await action(pool, token); + } + catch (Exception exc) + { + logger.LogError(exc, @"Batch pool {PoolId} threw an exception in {Poll}.", pool.Id, pollName); + } + }); + + logger.LogDebug(@"{Poll} for {PoolsCount} pools completed in {TotalSeconds} seconds.", pollName, pools.Count, DateTime.UtcNow.Subtract(startTime).TotalSeconds); + } + + /// + /// Repeatedly services all batch pools associated with this TES instance, including updating tasks. + /// + /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. + /// + private Task ServiceBatchPoolsAsync(CancellationToken stoppingToken) + { + return ExecuteActionOnIntervalAsync(RunInterval, async cancellationToken => + { + await ExecuteActionOnPoolsAsync( + "Service Batch Pools", + async (pool, token) => + { + await pool.ServicePoolAsync(token); + await ProcessCloudTaskStatesAsync(pool.Id, pool.GetCloudTaskStatesAsync(token), token); + await ProcessDeletedTasks(pool.GetTasksToDelete(token), token); + }, + cancellationToken); + }, stoppingToken); + + async ValueTask ProcessCloudTaskStatesAsync(string poolId, IAsyncEnumerable states, CancellationToken cancellationToken) + { + var list = new List<(TesTask TesTask, AzureBatchTaskState State)>(); + + await foreach (var (cloudTaskId, state) in states.WithCancellation(cancellationToken)) + { + TesTask tesTask = default; + if (await repository.TryGetItemAsync(batchScheduler.GetTesTaskIdFromCloudTaskId(cloudTaskId), cancellationToken, task => tesTask = task) && tesTask is not null) + { + list.Add((tesTask, state)); + } + else + { + 
logger.LogDebug(@"Unable to locate TesTask for CloudTask '{CloudTask}' with action state {ActionState}.", cloudTaskId, state.State); + } + } + + if (list.Count != 0) + { + await OrchestrateTesTasksOnBatchAsync( + $"NodeState ({poolId})", +#pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously + async _ => list.Select(t => t.TesTask).ToAsyncEnumerable(), +#pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously + (tesTasks, token) => batchScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, list.Select(t => t.State).ToArray(), token), + cancellationToken); + } + else + { + logger.LogDebug("No task state changes from pool/node information this time: PoolId: {PoolId}.", poolId); + } + } + + async ValueTask ProcessDeletedTasks(IAsyncEnumerable tasks, CancellationToken cancellationToken) + { + await foreach (var taskResult in batchScheduler.DeleteCloudTasksAsync(tasks, cancellationToken).WithCancellation(cancellationToken)) + { + try + { + switch (await taskResult) + { + case true: + logger.LogDebug(@"Azure task {CloudTask} was deleted.", taskResult.Related.TaskId); + break; + + case false: + logger.LogDebug(@"Azure task {CloudTask} was NOT deleted.", taskResult.Related.TaskId); + break; + } + } + catch (Exception exc) + { + logger.LogError(exc, @"Failed to delete azure task '{CloudTask}': '{ExceptionType}': '{ExceptionMessage}'", taskResult.Related.TaskId, exc.GetType().FullName, exc.Message); + } + } + } + } + } +} diff --git a/src/TesApi.Web/TesTaskTask.cs b/src/TesApi.Web/RelatedTask.cs similarity index 84% rename from src/TesApi.Web/TesTaskTask.cs rename to src/TesApi.Web/RelatedTask.cs index 1a56c1ea7..4f3f2191a 100644 --- a/src/TesApi.Web/TesTaskTask.cs +++ b/src/TesApi.Web/RelatedTask.cs @@ -4,12 +4,13 @@ namespace TesApi.Web { /// - /// Combines a with a . + /// Combines a with a . /// /// The wrapped . - /// The wrapped . + /// The wrapped item tied to the task/result. 
+ /// The type of . /// The type of the result produced by the . - public record class TesTaskTask(System.Threading.Tasks.Task Task, Tes.Models.TesTask TesTask) : System.IDisposable + public record class RelatedTask(System.Threading.Tasks.Task Task, TRelated Related) : System.IDisposable { /// Gets an awaiter used to await the . /// An awaiter instance. diff --git a/src/TesApi.Web/Startup.cs b/src/TesApi.Web/Startup.cs index d15c759e2..a19887af1 100644 --- a/src/TesApi.Web/Startup.cs +++ b/src/TesApi.Web/Startup.cs @@ -142,8 +142,8 @@ public void ConfigureServices(IServiceCollection services) // Order is important for hosted services .AddHostedService(sp => (AllowedVmSizesService)sp.GetRequiredService(typeof(IAllowedVmSizesService))) - .AddHostedService() - .AddHostedService(); + .AddHostedService() + .AddHostedService(); //.AddHostedService() } catch (Exception exc) diff --git a/src/TesApi.Web/Scheduler.cs b/src/TesApi.Web/TaskScheduler.cs similarity index 85% rename from src/TesApi.Web/Scheduler.cs rename to src/TesApi.Web/TaskScheduler.cs index a6b29147f..fd8dfc831 100644 --- a/src/TesApi.Web/Scheduler.cs +++ b/src/TesApi.Web/TaskScheduler.cs @@ -19,7 +19,7 @@ namespace TesApi.Web /// This should only be used as a system-wide singleton service. This class does not support scale-out on multiple machines, /// nor does it implement a leasing mechanism. In the future, consider using the Lease Blob operation. 
/// - internal class Scheduler : OrchestrateOnBatchSchedulerServiceBase + internal class TaskScheduler : OrchestrateOnBatchSchedulerServiceBase { private readonly TimeSpan blobRunInterval = TimeSpan.FromSeconds(5); private readonly TimeSpan batchRunInterval = TimeSpan.FromSeconds(30); // The very fastest process inside of Azure Batch accessing anything within pools or jobs uses a 30 second polling interval @@ -33,7 +33,7 @@ internal class Scheduler : OrchestrateOnBatchSchedulerServiceBase /// The main TES task database repository implementation /// The batch scheduler implementation /// The logger instance - public Scheduler(RunnerEventsProcessor nodeEventProcessor, Microsoft.Extensions.Hosting.IHostApplicationLifetime hostApplicationLifetime, IRepository repository, IBatchScheduler batchScheduler, ILogger logger) + public TaskScheduler(RunnerEventsProcessor nodeEventProcessor, Microsoft.Extensions.Hosting.IHostApplicationLifetime hostApplicationLifetime, IRepository repository, IBatchScheduler batchScheduler, ILogger logger) : base(hostApplicationLifetime, repository, batchScheduler, logger) { this.nodeEventProcessor = nodeEventProcessor; @@ -69,7 +69,6 @@ protected override Task ExecuteCoreAsync(CancellationToken stoppingToken) return Task.WhenAll( ExecuteCancelledTesTasksOnBatchAsync(stoppingToken), ExecuteQueuedTesTasksOnBatchAsync(stoppingToken), - ExecuteTerminatedTesTasksOnBatchAsync(stoppingToken), ExecuteUpdateTesTaskFromEventBlobAsync(stoppingToken)); } @@ -102,7 +101,7 @@ private Task ExecuteCancelledTesTasksOnBatchAsync(CancellationToken stoppingToke async cancellationToken => (await repository.GetItemsAsync( predicate: t => t.State == TesState.CANCELINGEnum, cancellationToken: cancellationToken)) - .OrderBy(t => t.CreationTime) + .OrderByDescending(t => t.CreationTime) .ToAsyncEnumerable()); return ExecuteActionOnIntervalAsync(batchRunInterval, @@ -117,32 +116,6 @@ private Task ExecuteCancelledTesTasksOnBatchAsync(CancellationToken stoppingToke 
stoppingToken); } - /// - /// Retrieves all terminated TES tasks from the database, performs an action in the batch system, and updates the resultant state - /// - /// Triggered when Microsoft.Extensions.Hosting.IHostedService.StopAsync(System.Threading.CancellationToken) is called. - /// - private Task ExecuteTerminatedTesTasksOnBatchAsync(CancellationToken stoppingToken) - { - var query = new Func>>( - async cancellationToken => (await repository.GetItemsAsync( - predicate: t => t.IsTaskDeletionRequired, - cancellationToken: cancellationToken)) - .OrderBy(t => t.CreationTime) - .ToAsyncEnumerable()); - - return ExecuteActionOnIntervalAsync(batchRunInterval, - cancellationToken => OrchestrateTesTasksOnBatchAsync( - "Terminated", - query, - (tasks, cancellationToken) => batchScheduler.ProcessTesTaskBatchStatesAsync( - tasks, - Enumerable.Repeat(new(AzureBatchTaskState.TaskState.CancellationRequested), tasks.Length).ToArray(), - cancellationToken), - cancellationToken), - stoppingToken); - } - /// /// Retrieves all event blobs from storage and updates the resultant state. 
/// @@ -250,7 +223,7 @@ await Parallel.ForEachAsync(markEventsProcessedList, stoppingToken, async (markE } catch (Exception ex) { - logger.LogError(ex, @""); + logger.LogError(ex, @"Failed to tag event processed."); } }); From 7b6c634c7dbcf1eed0796f5a2c1aa3d11fad42b2 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 9 Nov 2023 18:49:11 -0800 Subject: [PATCH 080/202] Move processing of output file log from server to node --- src/Tes.Runner/Events/EventsPublisher.cs | 16 ++-- src/Tes.Runner/Executor.cs | 1 + src/Tes.Runner/Transfer/BlobUploader.cs | 12 ++- src/TesApi.Web/Events/RunnerEventsMessage.cs | 7 +- .../Events/RunnerEventsProcessor.cs | 90 ++++++++++--------- 5 files changed, 71 insertions(+), 55 deletions(-) diff --git a/src/Tes.Runner/Events/EventsPublisher.cs b/src/Tes.Runner/Events/EventsPublisher.cs index 320c444e3..e4baac01a 100644 --- a/src/Tes.Runner/Events/EventsPublisher.cs +++ b/src/Tes.Runner/Events/EventsPublisher.cs @@ -102,14 +102,20 @@ public virtual async Task PublishUploadEndEventAsync(NodeTask nodeTask, int numb { { "numberOfFiles", numberOfFiles.ToString()}, { "totalSizeInBytes", totalSizeInBytes.ToString()}, - { "errorMessage", errorMessage??string.Empty} + { "errorMessage", errorMessage ?? string.Empty} }; - foreach (var (length, blobUrl, fileName, index) in completedFiles?.Select((logEntry, index) => (logEntry.Length, logEntry.BlobUrl, logEntry.FileName, index)) ?? Enumerable.Empty<(long, Uri?, string, int)>()) + if (completedFiles is not null) { - eventMessage.EventData.Add($"fileSize-{index}", length.ToString()); - eventMessage.EventData.Add($"fileUri-{index}", blobUrl?.ToString() ?? 
string.Empty); - eventMessage.EventData.Add($"filePath-{index}", fileName); + completedFiles = completedFiles.ToList(); + eventMessage.EventData.Add(@"fileLog-Count", completedFiles.Count().ToString("D")); + + foreach (var (logEntry, index) in completedFiles.Select((logEntry, index) => (logEntry, index))) + { + eventMessage.EventData.Add($"fileSize-{index}", logEntry.Length.ToString("D")); + eventMessage.EventData.Add($"fileUri-{index}", logEntry.BlobUrl?.AbsoluteUri ?? string.Empty); + eventMessage.EventData.Add($"filePath-{index}", logEntry.FileName); + } } await PublishAsync(eventMessage); diff --git a/src/Tes.Runner/Executor.cs b/src/Tes.Runner/Executor.cs index e2ff8232a..7968a78de 100644 --- a/src/Tes.Runner/Executor.cs +++ b/src/Tes.Runner/Executor.cs @@ -132,6 +132,7 @@ public async Task UploadOutputsAsync(BlobPipelineOptions blobPipelineOptio logger.LogError(e, "Upload operation failed"); statusMessage = EventsPublisher.FailedStatus; errorMessage = e.Message; + completedFiles = default; throw; } finally diff --git a/src/Tes.Runner/Transfer/BlobUploader.cs b/src/Tes.Runner/Transfer/BlobUploader.cs index 4c661b23a..5f3733d03 100644 --- a/src/Tes.Runner/Transfer/BlobUploader.cs +++ b/src/Tes.Runner/Transfer/BlobUploader.cs @@ -14,7 +14,7 @@ public class BlobUploader : BlobOperationPipeline { private readonly ConcurrentDictionary hashListProviders = new(); - internal readonly IDictionary MapPathToMountPrefixLength = new Dictionary(); + internal readonly IDictionary mapPathToMountPrefixLength = new Dictionary(); internal readonly ConcurrentBag CompletedFiles = new(); public BlobUploader(BlobPipelineOptions pipelineOptions, Channel memoryBufferPool) : base(pipelineOptions, memoryBufferPool) @@ -143,12 +143,16 @@ public override async Task OnCompletionAsync(long length, Uri? 
blobUrl, string f throw; } finally + { + response?.Dispose(); + } + + if (mapPathToMountPrefixLength.TryGetValue(fileName, out var prefixLength)) { CompletedFiles.Add(new( length, new Azure.Storage.Blobs.BlobUriBuilder(blobUrl) { Sas = null }.ToUri(), - MapPathToMountPrefixLength.TryGetValue(fileName, out var prefixLength) ? fileName[prefixLength..] : fileName)); - response?.Dispose(); + fileName[prefixLength..])); } } @@ -164,7 +168,7 @@ public virtual async Task UploadAsync(List uploadList) foreach (var upload in uploadList.Where(upload => !string.IsNullOrWhiteSpace(upload.MountParentDirectory)).Where(upload => upload.FullFilePath.StartsWith(upload.MountParentDirectory!))) { - MapPathToMountPrefixLength[upload.FullFilePath] = upload.MountParentDirectory!.Length; + mapPathToMountPrefixLength[upload.FullFilePath] = upload.MountParentDirectory!.Length; } var operationList = uploadList.Select(d => new BlobOperationInfo(d.TargetUri, d.FullFilePath, d.FullFilePath, true)).ToList(); diff --git a/src/TesApi.Web/Events/RunnerEventsMessage.cs b/src/TesApi.Web/Events/RunnerEventsMessage.cs index 033979ba6..1116d1588 100644 --- a/src/TesApi.Web/Events/RunnerEventsMessage.cs +++ b/src/TesApi.Web/Events/RunnerEventsMessage.cs @@ -18,16 +18,17 @@ public record struct RunnerEventsMessage(Uri BlobUri, IDictionary /// Copy constructor replacing . /// - /// - /// + /// . + /// Content of this event message. public RunnerEventsMessage(RunnerEventsMessage original, Tes.Runner.Events.EventMessage runnerEventMessage) : this(original.BlobUri, original.Tags, original.Event, runnerEventMessage) { } /// - /// + /// The associated with this event. /// + /// This property is only populated when the content is included. 
public readonly string TesTaskId => RunnerEventMessage?.EntityId; } } diff --git a/src/TesApi.Web/Events/RunnerEventsProcessor.cs b/src/TesApi.Web/Events/RunnerEventsProcessor.cs index c7daad3bd..251320f81 100644 --- a/src/TesApi.Web/Events/RunnerEventsProcessor.cs +++ b/src/TesApi.Web/Events/RunnerEventsProcessor.cs @@ -27,9 +27,9 @@ public class RunnerEventsProcessor /// /// Constructor of . /// - /// - /// - /// + /// Azure API wrapper. + /// Methods for abstracting storage access. + /// Methods for abstracting storage access. public RunnerEventsProcessor(IAzureProxy azureProxy, Storage.IStorageAccessProvider storageAccessProvider, ILogger logger) { ArgumentNullException.ThrowIfNull(azureProxy); @@ -42,10 +42,10 @@ public RunnerEventsProcessor(IAzureProxy azureProxy, Storage.IStorageAccessProvi /// - /// TODO + /// Validate the . /// - /// - /// + /// Tes runner event message metadata. + /// Validation exceptions. public void ValidateMessageMetadata(RunnerEventsMessage message) { if (message.BlobUri is null) @@ -75,7 +75,7 @@ public void ValidateMessageMetadata(RunnerEventsMessage message) if (!message.Tags.ContainsKey("event-name") || !message.Tags.ContainsKey("task-id") || !message.Tags.ContainsKey("created")) { - throw new ArgumentException("This message is missing needed tags.", nameof(message)); + throw new ArgumentException("This message is missing required tags.", nameof(message)); } // There are up to 10 tags allowed. We will be adding one. @@ -89,9 +89,11 @@ public void ValidateMessageMetadata(RunnerEventsMessage message) /// /// Gets the details of this event message. /// - /// - /// - /// + /// Tes runner event message metadata. + /// A for controlling the lifetime of the asynchronous operation. + /// Validation exceptions. + /// A containing the associated . + /// This method assumes was successfully validated by . 
public async Task DownloadAndValidateMessageContentAsync(RunnerEventsMessage message, CancellationToken cancellationToken) { Tes.Runner.Events.EventMessage content; @@ -195,10 +197,11 @@ static void Assert([System.Diagnostics.CodeAnalysis.DoesNotReturnIf(false)] bool /// /// Returns a sequence in the order the events were produced. /// - /// - /// - /// - /// + /// 's enumerated type. + /// Unordered enumeration of events. + /// Function that returns from . + /// Ordered enumeration of events. + /// This method assumes every was successfully validated by . public IEnumerable OrderProcessedByExecutorSequence(IEnumerable source, Func messageGetter) { ArgumentNullException.ThrowIfNull(source); @@ -215,16 +218,17 @@ static int ThenBy(RunnerEventsMessage message) : message.RunnerEventMessage.Name); static int ParseEventName(string eventName) - => EventsInOrder.TryGetValue(eventName, out var result) ? result : int.MinValue; + => EventsInOrder.TryGetValue(eventName, out var result) ? result : 0; } /// /// Gets the task status details from this event message. /// - /// - /// - /// - /// + /// Tes runner event message metadata. + /// associated with . + /// A for controlling the lifetime of the asynchronous operation. + /// populated from . + /// This method assumes was returned by . 
public async Task GetMessageBatchStateAsync(RunnerEventsMessage message, Tes.Models.TesTask tesTask, CancellationToken cancellationToken) { ArgumentNullException.ThrowIfNull(message.RunnerEventMessage, nameof(message)); @@ -280,7 +284,7 @@ public async Task GetMessageBatchStateAsync(RunnerEventsMes { Tes.Runner.Events.EventsPublisher.SuccessStatus => new( AzureBatchTaskState.TaskState.InfoUpdate, - OutputFileLogs: GetFileLogs(nodeMessage.EventData)), + OutputFileLogs: GetOutputFileLogs(nodeMessage.EventData)), Tes.Runner.Events.EventsPublisher.FailedStatus => new( AzureBatchTaskState.TaskState.NodeFilesUploadOrDownloadFailed, @@ -318,33 +322,34 @@ public async Task GetMessageBatchStateAsync(RunnerEventsMes return state; // Helpers - async ValueTask> AddProcessLogsIfAvailable(Tes.Runner.Events.EventMessage message, Tes.Models.TesTask tesTask, CancellationToken cancellationToken) - { - var processLogs = await GetProcessLogs(message, tesTask, cancellationToken).ToListAsync(cancellationToken); - - if (processLogs.Any()) - { - processLogs.Insert(0, "Possibly relevant logs:"); - } - - return processLogs; - } - - static IEnumerable GetFileLogs(IDictionary eventData) + static IEnumerable GetOutputFileLogs(IDictionary eventData) { - if (eventData is null) + if (eventData is null || !eventData.ContainsKey("fileLog-Count")) { yield break; } - var numberOfFiles = int.Parse(eventData["numberOfFiles"]); + var numberOfFiles = int.Parse(eventData["fileLog-Count"], System.Globalization.CultureInfo.InvariantCulture); + for (var i = 0; i < numberOfFiles; ++i) { yield return new( new Uri(eventData[$"fileUri-{i}"]), eventData[$"filePath-{i}"], - long.Parse(eventData[$"fileSize-{i}"])); + long.Parse(eventData[$"fileSize-{i}"], System.Globalization.CultureInfo.InvariantCulture)); + } + } + + async ValueTask> AddProcessLogsIfAvailable(Tes.Runner.Events.EventMessage message, Tes.Models.TesTask tesTask, CancellationToken cancellationToken) + { + var processLogs = await 
GetProcessLogs(message, tesTask, cancellationToken).ToListAsync(cancellationToken); + + if (processLogs.Any()) + { + processLogs.Insert(0, "Possibly relevant logs:"); } + + return processLogs; } async IAsyncEnumerable GetProcessLogs(Tes.Runner.Events.EventMessage message, Tes.Models.TesTask tesTask, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) @@ -362,11 +367,9 @@ async IAsyncEnumerable GetProcessLogs(Tes.Runner.Events.EventMessage mes yield break; } - var listUri = await storageAccessProvider.GetInternalTesTaskBlobUrlAsync(tesTask, string.Empty, Azure.Storage.Sas.BlobSasPermissions.List, cancellationToken); - - await foreach (var uri in azureProxy.ListBlobsAsync(new(listUri), cancellationToken) - .Where(blob => blob.BlobName.EndsWith(".txt") && System.IO.Path.GetFileName(blob.BlobName).StartsWith(blobNameStartsWith)) - .OrderBy(blob => blob.BlobName) + await foreach (var uri in azureProxy.ListBlobsAsync(new(await storageAccessProvider.GetInternalTesTaskBlobUrlAsync(tesTask, string.Empty, Azure.Storage.Sas.BlobSasPermissions.List, cancellationToken)), cancellationToken) + .Where(blob => blob.BlobName.EndsWith(".txt") && blob.BlobName.Split('/').Last().StartsWith(blobNameStartsWith)) + .OrderBy(blob => blob.BlobName) // Not perfect ordering, but reasonable. This is more likely to be read by people rather then machines. Perfect would involve regex. .Select(blob => blob.BlobUri) .WithCancellation(cancellationToken)) { @@ -378,9 +381,10 @@ async IAsyncEnumerable GetProcessLogs(Tes.Runner.Events.EventMessage mes /// /// Marks this event message processed. /// - /// - /// + /// Tes runner event message metadata. + /// A for controlling the lifetime of the asynchronous operation. /// + /// This method assumes was successfully validated by . 
public async Task MarkMessageProcessedAsync(RunnerEventsMessage message, CancellationToken cancellationToken) { await azureProxy.SetBlobTags( From feb8ccc16c03858288c910db5f1ea7ef4091f14d Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 9 Nov 2023 21:10:57 -0800 Subject: [PATCH 081/202] Log retries --- .../CacheAndRetryHandlerTest.cs | 6 +- .../TerraLandingZoneApiClientTest.cs | 21 +- .../TerraWsmApiClientTests.cs | 15 +- src/Tes.ApiClients/CachingRetryHandler.cs | 21 +- src/Tes.ApiClients/HttpApiClient.cs | 102 ++++-- src/Tes.ApiClients/RetryHandler.cs | 109 +++++-- src/Tes.ApiClients/TerraWsmApiClient.cs | 5 +- .../ContainerRegistryProviderTests.cs | 6 +- .../TestServices/TestServiceProvider.cs | 1 + src/TesApi.Web/AzureProxy.cs | 97 ++++-- .../CachingWithRetriesAzureProxy.cs | 304 ++++++++++++++---- .../PagedInterfaceExtensions.cs | 22 +- .../ArmResourceInformationFinder.cs | 1 + .../Management/ContainerRegistryProvider.cs | 5 +- 14 files changed, 532 insertions(+), 183 deletions(-) rename src/TesApi.Web/{ => Extensions}/PagedInterfaceExtensions.cs (92%) diff --git a/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs b/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs index 476512671..efb641858 100644 --- a/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs +++ b/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs @@ -75,7 +75,7 @@ public async Task ExecuteWithRetryAndCachingAsync_ValueIsNotCachedOnFailureAndTh await Assert.ThrowsExceptionAsync(() => cachingRetryHandler.ExecuteWithRetryAndCachingAsync(cacheKey, _ => Task.Run(() => mockInstanceToRetry.Object.ToString()), System.Threading.CancellationToken.None)); - Assert.IsFalse(appCache.TryGetValue(cacheKey, out string _)); + Assert.IsFalse(appCache.TryGetValue(cacheKey, out string? 
_)); } [TestMethod] @@ -92,7 +92,7 @@ public async Task ExecuteHttpRequestWithRetryAsync_RetriesOnlyOnExpectedFailureC mockFactory.Setup(f => f.CreateResponseAsync()).Returns(CreateResponseAsync(statusCode)); var response = - await cachingRetryHandler.ExecuteHttpRequestWithRetryAsync(_ => + await cachingRetryHandler.ExecuteWithRetryAsync(_ => mockFactory.Object.CreateResponseAsync(), System.Threading.CancellationToken.None); @@ -150,7 +150,7 @@ await cachingRetryHandler.ExecuteHttpRequestWithRetryAsync(_ => // Assert.IsFalse(appCache.TryGetValue(cacheKey, out HttpResponseMessage _)); // } - private Task CreateResponseAsync(HttpStatusCode statusCode) + private static Task CreateResponseAsync(HttpStatusCode statusCode) => Task.FromResult(new(statusCode)); public interface ITestHttpResponseMessageFactory diff --git a/src/Tes.ApiClients.Tests/TerraLandingZoneApiClientTest.cs b/src/Tes.ApiClients.Tests/TerraLandingZoneApiClientTest.cs index 646ad4500..7e975ec6d 100644 --- a/src/Tes.ApiClients.Tests/TerraLandingZoneApiClientTest.cs +++ b/src/Tes.ApiClients.Tests/TerraLandingZoneApiClientTest.cs @@ -21,6 +21,9 @@ public void SetUp() terraApiStubData = new TerraApiStubData(); tokenCredential = new Mock(); cacheAndRetryHandler = new Mock(); + var cache = new Mock(); + cache.Setup(c => c.CreateEntry(It.IsAny())).Returns(new Mock().Object); + cacheAndRetryHandler.SetupGet(c => c.AppCache).Returns(cache.Object); terraLandingZoneApiClient = new TerraLandingZoneApiClient(TerraApiStubData.LandingZoneApiHost, tokenCredential.Object, cacheAndRetryHandler.Object, NullLogger.Instance); } @@ -28,8 +31,13 @@ public void SetUp() public async Task GetResourceQuotaAsync_ValidResourceIdReturnsQuotaInformationAndGetsAuthToken() { var body = terraApiStubData.GetResourceQuotaApiResponseInJson(); - cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAndCachingAsync(It.IsAny(), - It.IsAny>>(), It.IsAny())) + + cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync( + It.IsAny>>(), It.IsAny(), 
It.IsAny())) + .ReturnsAsync(new HttpResponseMessage()); + + cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync( + It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(body); var quota = await terraLandingZoneApiClient.GetResourceQuotaAsync(terraApiStubData.LandingZoneId, terraApiStubData.BatchAccountId, cacheResults: true, cancellationToken: CancellationToken.None); @@ -55,8 +63,12 @@ public async Task GetLandingZoneResourcesAsync_ListOfLandingZoneResourcesAndGets { var body = terraApiStubData.GetResourceApiResponseInJson(); - cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAndCachingAsync(It.IsAny(), - It.IsAny>>(), It.IsAny())) + cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync( + It.IsAny>>(), It.IsAny(), It.IsAny())) + .ReturnsAsync(new HttpResponseMessage()); + + cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync( + It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(body); var resources = await terraLandingZoneApiClient.GetLandingZoneResourcesAsync(terraApiStubData.LandingZoneId, CancellationToken.None); @@ -67,7 +79,6 @@ public async Task GetLandingZoneResourcesAsync_ListOfLandingZoneResourcesAndGets tokenCredential.Verify(t => t.GetTokenAsync(It.IsAny(), It.IsAny()), Times.Once); - } [TestMethod] diff --git a/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs b/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs index 8d75ce40d..3e6bf460e 100644 --- a/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs +++ b/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs @@ -25,6 +25,9 @@ public void SetUp() terraApiStubData = new TerraApiStubData(); tokenCredential = new Mock(); cacheAndRetryHandler = new Mock(); + var cache = new Mock(); + cache.Setup(c => c.CreateEntry(It.IsAny())).Returns(new Mock().Object); + cacheAndRetryHandler.SetupGet(c => c.AppCache).Returns(cache.Object); terraWsmApiClient = new TerraWsmApiClient(TerraApiStubData.WsmApiHost, tokenCredential.Object, cacheAndRetryHandler.Object, NullLogger.Instance); } @@ -89,9 +92,12 @@ 
public async Task GetSasTokenAsync_ValidRequest_ReturnsPayload() Content = new StringContent(terraApiStubData.GetWsmSasTokenApiResponseInJson()) }; - cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny())) + cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(response); + cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) + .Returns((Func> action, CancellationToken cancellationToken, Polly.Context _2) => action(cancellationToken)); + var apiResponse = await terraWsmApiClient.GetSasTokenAsync(terraApiStubData.WorkspaceId, terraApiStubData.ContainerResourceId, null!, CancellationToken.None); @@ -108,9 +114,12 @@ public async Task GetContainerResourcesAsync_ValidRequest_ReturnsPayload() Content = new StringContent(terraApiStubData.GetContainerResourcesApiResponseInJson()) }; - cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny())) + cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(response); + cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) + .Returns((Func> action, CancellationToken cancellationToken, Polly.Context _2) => action(cancellationToken)); + var apiResponse = await terraWsmApiClient.GetContainerResourcesAsync(terraApiStubData.WorkspaceId, offset: 0, limit: 10, CancellationToken.None); @@ -125,7 +134,7 @@ public async Task DeleteBatchPoolAsync_204Response_Succeeds() var wsmResourceId = Guid.NewGuid(); var response = new HttpResponseMessage(HttpStatusCode.NoContent); - cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny())) + cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(response); await terraWsmApiClient.DeleteBatchPoolAsync(terraApiStubData.WorkspaceId, wsmResourceId, CancellationToken.None); diff --git 
a/src/Tes.ApiClients/CachingRetryHandler.cs b/src/Tes.ApiClients/CachingRetryHandler.cs index 228349964..5bc94e775 100644 --- a/src/Tes.ApiClients/CachingRetryHandler.cs +++ b/src/Tes.ApiClients/CachingRetryHandler.cs @@ -3,6 +3,7 @@ using Microsoft.Extensions.Caching.Memory; using Microsoft.Extensions.Options; +using Polly; using Tes.ApiClients.Options; namespace Tes.ApiClients @@ -43,12 +44,13 @@ protected CachingRetryHandler() { } /// /// Action to execute /// A for controlling the lifetime of the asynchronous operation. + /// /// - public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, CancellationToken cancellationToken) + public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, CancellationToken cancellationToken, Context? context = default) { ValidateArgs(cacheKey, action); - return await ExecuteWithCacheAsync(cacheKey, () => ExecuteWithRetryAsync(action, cancellationToken)); + return await ExecuteWithCacheAsync(cacheKey, () => ExecuteWithRetryAsync(action, cancellationToken, context)); } /// @@ -58,23 +60,14 @@ public virtual async Task ExecuteWithRetryAndCachingAsync(stri /// Action to execute /// /// A for controlling the lifetime of the asynchronous operation. + /// /// /// - public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, DateTimeOffset cachesExpires, CancellationToken cancellationToken) + public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, DateTimeOffset cachesExpires, CancellationToken cancellationToken, Context? 
context = default) { ValidateArgs(cacheKey, action); - return await ExecuteWithCacheAsync(cacheKey, () => ExecuteWithRetryAsync(action, cancellationToken), cachesExpires); - } - - private static void ValidateArgs(string cacheKey, Func action) - { - ArgumentNullException.ThrowIfNull(action); - - if (string.IsNullOrEmpty(cacheKey)) - { - throw new ArgumentNullException(nameof(cacheKey), "Invalid cache key. The value can't be null or empty"); - } + return await ExecuteWithCacheAsync(cacheKey, () => ExecuteWithRetryAsync(action, cancellationToken, context), cachesExpires); } private static void ValidateArgs(string cacheKey, Func action) diff --git a/src/Tes.ApiClients/HttpApiClient.cs b/src/Tes.ApiClients/HttpApiClient.cs index 26e482ba0..03b2cd3b1 100644 --- a/src/Tes.ApiClients/HttpApiClient.cs +++ b/src/Tes.ApiClients/HttpApiClient.cs @@ -6,6 +6,7 @@ using System.Text; using System.Text.Json; using Azure.Core; +using Microsoft.Extensions.Caching.Memory; using Microsoft.Extensions.Logging; namespace Tes.ApiClients @@ -68,6 +69,36 @@ protected HttpApiClient(TokenCredential tokenCredential, string tokenScope, /// protected HttpApiClient() { } + /// + /// A logging Polly retry handler. + /// + /// Calling method name. + /// + private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHandler([System.Runtime.CompilerServices.CallerMemberName] string? caller = default) + => new((exception, timeSpan, retryCount, correlationId) => + { + Logger?.LogError(exception, @"Retrying in {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan} CorrelationId: {CorrelationId:D}", caller, retryCount, timeSpan, correlationId); + }); + + /// + /// A logging Polly retry handler. + /// + /// See + /// Calling method name. + /// + private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHandler([System.Runtime.CompilerServices.CallerMemberName] string? 
caller = default) + => new((result, timeSpan, retryCount, correlationId) => + { + if (result.Exception is null) + { + Logger?.LogError(@"Retrying in {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan} CorrelationId: {CorrelationId:D}", caller, retryCount, timeSpan, correlationId); + } + else + { + Logger?.LogError(result.Exception, @"Retrying in {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan} CorrelationId: {CorrelationId:D}", caller, retryCount, timeSpan, correlationId); + } + }); + /// /// Sends request with a retry policy /// @@ -78,7 +109,10 @@ protected HttpApiClient() { } /// protected async Task HttpSendRequestWithRetryPolicyAsync( Func httpRequestFactory, CancellationToken cancellationToken, bool setAuthorizationHeader = false) - => await cachingRetryHandler.ExecuteWithRetryAsync(async ct => + { + var ctx = new Polly.Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return await cachingRetryHandler.ExecuteWithRetryAsync(async ct => { var request = httpRequestFactory(); if (setAuthorizationHeader) @@ -87,7 +121,8 @@ protected async Task HttpSendRequestWithRetryPolicyAsync( } return await HttpClient.SendAsync(request, ct); - }, cancellationToken); + }, cancellationToken, ctx); + } /// /// Sends a Http Get request to the URL and returns body response as string @@ -136,12 +171,20 @@ protected async Task HttpGetRequestWithCachingAndRetryPolicyAsync(Uri re { var cacheKey = await ToCacheKeyAsync(requestUrl, setAuthorizationHeader, cancellationToken); - return await cachingRetryHandler.ExecuteWithRetryAndCachingAsync(cacheKey, async ct => + return (await cachingRetryHandler.AppCache.GetOrCreateAsync(cacheKey, async _ => { - var httpRequest = await CreateGetHttpRequest(requestUrl, setAuthorizationHeader, ct); + var ctx = new Polly.Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + var response = await cachingRetryHandler.ExecuteWithRetryAsync(async ct => + { + var httpRequest = await CreateGetHttpRequest(requestUrl, 
setAuthorizationHeader, ct); + + var httpResponse = await HttpClient.SendAsync(httpRequest, ct); + return httpResponse.EnsureSuccessStatusCode(); + }, cancellationToken, ctx); - return await ExecuteRequestAndReadResponseBodyAsync(httpRequest, ct); - }, cancellationToken); + return await ReadResponseBodyAsync(response, cancellationToken); + }))!; } /// @@ -153,13 +196,20 @@ protected async Task HttpGetRequestWithCachingAndRetryPolicyAsync(Uri re /// protected async Task HttpGetRequestWithRetryPolicyAsync(Uri requestUrl, CancellationToken cancellationToken, bool setAuthorizationHeader = false) - => await cachingRetryHandler.ExecuteWithRetryAsync(async ct => + { + var ctx = new Polly.Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + var response = await cachingRetryHandler.ExecuteWithRetryAsync(async ct => { //request must be recreated in every retry. var httpRequest = await CreateGetHttpRequest(requestUrl, setAuthorizationHeader, ct); - return await ExecuteRequestAndReadResponseBodyAsync(httpRequest, ct); - }, cancellationToken); + var httpResponse = await HttpClient.SendAsync(httpRequest, ct); + return httpResponse.EnsureSuccessStatusCode(); + }, cancellationToken, ctx); + + return await ReadResponseBodyAsync(response, cancellationToken); + } /// /// Returns an query string key-value, with the value escaped. 
If the value is null or empty returns an empty string @@ -167,7 +217,7 @@ protected async Task HttpGetRequestWithRetryPolicyAsync(Uri requestUrl, /// parameter name /// parameter value /// - protected string ParseQueryStringParameter(string name, string value) + protected static string ParseQueryStringParameter(string name, string value) { ArgumentException.ThrowIfNullOrEmpty(name); @@ -184,15 +234,15 @@ protected string ParseQueryStringParameter(string name, string value) /// /// /// - protected string AppendQueryStringParams(params string[] arguments) + protected static string AppendQueryStringParams(params string[] arguments) { if (arguments.Length == 0) { return string.Empty; } - var queryString = ""; - var prefix = ""; + var queryString = string.Empty; + var prefix = string.Empty; foreach (var argument in arguments) { @@ -218,13 +268,11 @@ private async Task CreateGetHttpRequest(Uri requestUrl, bool return httpRequest; } - private static async Task ExecuteRequestAndReadResponseBodyAsync(HttpRequestMessage request, CancellationToken cancellationToken) + protected async Task ReadResponseBodyAsync(HttpResponseMessage response, CancellationToken cancellationToken) { - var response = await HttpClient.SendAsync(request, cancellationToken); - - response.EnsureSuccessStatusCode(); - - return await response.Content.ReadAsStringAsync(cancellationToken); + var ctx = new Polly.Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return await cachingRetryHandler.ExecuteWithRetryAsync(response.Content.ReadAsStringAsync, cancellationToken, ctx); } private async Task AddAuthorizationHeaderToRequestAsync(HttpRequestMessage requestMessage, CancellationToken cancellationToken) @@ -235,7 +283,7 @@ private async Task AddAuthorizationHeaderToRequestAsync(HttpRequestMessage reque nameof(tokenScope)); } - Logger.LogTrace("Getting token for scope:{}", tokenScope); + Logger.LogTrace("Getting token for scope:{TokenScope}", tokenScope); try { @@ -245,8 +293,8 @@ private 
async Task AddAuthorizationHeaderToRequestAsync(HttpRequestMessage reque } catch (Exception e) { - Logger.LogError(@"Failed to set authentication header with the access token for scope:{tokenScope}", - e); + Logger.LogError(e, @"Failed to set authentication header with the access token for scope:{TokenScope}", + tokenScope); throw; } } @@ -260,7 +308,7 @@ private async Task GetOrRefreshAccessTokenAsync(CancellationToken cancel if (DateTimeOffset.UtcNow < accessToken.ExpiresOn) { Logger.LogTrace( - $"Using existing token. Token has not expired. Token expiration date: {accessToken.ExpiresOn}"); + @"Using existing token. Token has not expired. Token expiration date: {TokenExpiresOn}", accessToken.ExpiresOn); return accessToken.Token; } @@ -268,7 +316,7 @@ private async Task GetOrRefreshAccessTokenAsync(CancellationToken cancel new TokenRequestContext(new[] { tokenScope }), cancellationToken); - Logger.LogTrace($"Returning a new token with an expiration date of: {newAccessToken.ExpiresOn}"); + Logger.LogTrace(@"Returning a new token with an expiration date of: {TokenExpiresOn}", newAccessToken.ExpiresOn); accessToken = newAccessToken; return accessToken.Token; } @@ -303,7 +351,7 @@ private string ToHash(string input) { var hash = sha256.ComputeHash(Encoding.ASCII.GetBytes(input)); - return hash.Aggregate("", (current, t) => current + t.ToString("X2")); + return hash.Aggregate(string.Empty, (current, t) => current + t.ToString("X2")); } /// @@ -313,11 +361,11 @@ private string ToHash(string input) /// A for controlling the lifetime of the asynchronous operation. 
/// Response's content deserialization type /// - protected static async Task GetApiResponseContentAsync(HttpResponseMessage response, CancellationToken cancellationToken) + protected async Task GetApiResponseContentAsync(HttpResponseMessage response, CancellationToken cancellationToken) { response.EnsureSuccessStatusCode(); - return JsonSerializer.Deserialize(await response.Content.ReadAsStringAsync(cancellationToken))!; + return JsonSerializer.Deserialize(await ReadResponseBodyAsync(response, cancellationToken))!; } } } diff --git a/src/Tes.ApiClients/RetryHandler.cs b/src/Tes.ApiClients/RetryHandler.cs index 59af57461..c6a14b203 100644 --- a/src/Tes.ApiClients/RetryHandler.cs +++ b/src/Tes.ApiClients/RetryHandler.cs @@ -19,6 +19,32 @@ public class RetryHandler private readonly AsyncRetryPolicy asyncRetryPolicy = null!; private readonly AsyncRetryPolicy asyncHttpRetryPolicy = null!; + /// + /// The key in where or is stored. + /// + public const string OnRetryHandlerKey = "OnRetryHandler"; + + /// + /// The action to call on each retry. + /// + /// The handled exception. + /// The current sleep duration. + /// The current retry count. It starts at 1 between the first handled condition and the first wait, then 2, etc. + /// A Guid guaranteed to be unique to each execution. Acts as a correlation id so that events specific to a single execution can be identified in logging and telemetry. + /// This is called right before the wait. + public delegate void OnRetryHandler(Exception outcome, TimeSpan timespan, int retryCount, Guid correlationId); + + /// + /// The action to call on each retry. + /// + /// See . + /// The handled exception or result. + /// The current sleep duration. + /// The current retry count. It starts at 1 between the first handled condition and the first wait, then 2, etc. + /// A Guid guaranteed to be unique to each execution. Acts as a correlation id so that events specific to a single execution can be identified in logging and telemetry. 
+ /// This is called right before the wait. + public delegate void OnRetryHandler(DelegateResult result, TimeSpan timespan, int retryCount, Guid correlationId); + /// /// Synchronous retry policy instance. /// @@ -32,17 +58,27 @@ public RetryHandler(IOptions retryPolicyOptions) .Handle() .WaitAndRetry(retryPolicyOptions.Value.MaxRetryCount, (attempt) => TimeSpan.FromSeconds(Math.Pow(retryPolicyOptions.Value.ExponentialBackOffExponent, - attempt))); + attempt)), OnRetry); this.asyncRetryPolicy = Policy .Handle() .WaitAndRetryAsync(retryPolicyOptions.Value.MaxRetryCount, (attempt) => TimeSpan.FromSeconds(Math.Pow(retryPolicyOptions.Value.ExponentialBackOffExponent, - attempt))); + attempt)), OnRetry); this.asyncHttpRetryPolicy = HttpPolicyExtensions.HandleTransientHttpError() .OrResult(r => r.StatusCode == HttpStatusCode.TooManyRequests) .WaitAndRetryAsync(retryPolicyOptions.Value.MaxRetryCount, (attempt) => TimeSpan.FromSeconds(Math.Pow(retryPolicyOptions.Value.ExponentialBackOffExponent, - attempt))); + attempt)), OnRetry); + } + + public static void OnRetry(DelegateResult result, TimeSpan span, int retryCount, Context ctx) + { + ctx.GetOnRetryHandler()?.Invoke(result, span, retryCount, ctx.CorrelationId); + } + + public static void OnRetry(Exception outcome, TimeSpan timespan, int retryCount, Context ctx) + { + ctx.GetOnRetryHandler()?.Invoke(outcome, timespan, retryCount, ctx.CorrelationId); } /// @@ -59,37 +95,14 @@ protected RetryHandler() { } /// Executes a delegate with the specified policy. /// /// Action to execute - /// Result instance - public void ExecuteWithRetry(Action action) - { - ArgumentNullException.ThrowIfNull(action); - - retryPolicy.Execute(action); - } - - /// - /// Executes a delegate with the specified policy. 
- /// - /// Action to execute - /// Result instance - public TResult ExecuteWithRetry(Func action) - { - ArgumentNullException.ThrowIfNull(action); - - return retryPolicy.Execute(action); - } - - /// - /// Executes a delegate with the specified async policy. - /// - /// Action to execute + /// /// Result type /// Result instance - public virtual Task ExecuteWithRetryAsync(Func> action) + public TResult ExecuteWithRetry(Func action, Context? context = default) { ArgumentNullException.ThrowIfNull(action); - return asyncRetryPolicy.ExecuteAsync(action); + return retryPolicy.Execute(_ => action(), context ?? new()); } /// @@ -97,13 +110,14 @@ public virtual Task ExecuteWithRetryAsync(Func> /// /// Action to execute /// A for controlling the lifetime of the asynchronous operation. + /// /// Result type /// Result instance - public virtual Task ExecuteWithRetryAsync(Func> action, CancellationToken cancellationToken) + public virtual Task ExecuteWithRetryAsync(Func> action, CancellationToken cancellationToken, Context? context = default) { ArgumentNullException.ThrowIfNull(action); - return asyncRetryPolicy.ExecuteAsync(action, cancellationToken); + return asyncRetryPolicy.ExecuteAsync((_, ct) => action(ct), context ?? new(), cancellationToken); } /// @@ -111,12 +125,13 @@ public virtual Task ExecuteWithRetryAsync(Func /// Action to execute /// A for controlling the lifetime of the asynchronous operation. + /// /// Result instance - public async Task ExecuteWithRetryAsync(Func action, CancellationToken cancellationToken) + public async Task ExecuteWithRetryAsync(Func action, CancellationToken cancellationToken, Context? context = default) { ArgumentNullException.ThrowIfNull(action); - await asyncRetryPolicy.ExecuteAsync(action, cancellationToken); + await asyncRetryPolicy.ExecuteAsync((_, ct) => action(ct), context ?? 
new(), cancellationToken); } /// @@ -124,11 +139,35 @@ public async Task ExecuteWithRetryAsync(Func action, Ca /// /// Action to execute /// A for controlling the lifetime of the asynchronous operation. + /// /// Result HttpResponse - public virtual async Task ExecuteHttpRequestWithRetryAsync(Func> action, CancellationToken cancellationToken) + public virtual async Task ExecuteWithRetryAsync(Func> action, CancellationToken cancellationToken, Context? context = default) { ArgumentNullException.ThrowIfNull(action); - return await asyncHttpRetryPolicy.ExecuteAsync(action, cancellationToken); + return await asyncHttpRetryPolicy.ExecuteAsync((_, ct) => action(ct), context ?? new(), cancellationToken); + } +} + +public static class RetryHandlerExtensions +{ + public static void SetOnRetryHandler(this Context context, RetryHandler.OnRetryHandler onRetry) + { + context[RetryHandler.OnRetryHandlerKey] = onRetry; + } + + public static RetryHandler.OnRetryHandler? GetOnRetryHandler(this Context context) + { + return context.TryGetValue(RetryHandler.OnRetryHandlerKey, out var handler) ? (RetryHandler.OnRetryHandler)handler : default; + } + + public static void SetOnRetryHandler(this Context context, RetryHandler.OnRetryHandler onRetry) + { + context[RetryHandler.OnRetryHandlerKey] = onRetry; + } + + public static RetryHandler.OnRetryHandler? GetOnRetryHandler(this Context context) + { + return context.TryGetValue(RetryHandler.OnRetryHandlerKey, out var handler) ? 
(RetryHandler.OnRetryHandler)handler : default; } } diff --git a/src/Tes.ApiClients/TerraWsmApiClient.cs b/src/Tes.ApiClients/TerraWsmApiClient.cs index 966f25921..c22308491 100644 --- a/src/Tes.ApiClients/TerraWsmApiClient.cs +++ b/src/Tes.ApiClients/TerraWsmApiClient.cs @@ -181,10 +181,11 @@ public string GetDeleteBatchPoolUrl(Guid workspaceId, Guid wsmBatchPoolResourceI private async Task LogResponseContentAsync(HttpResponseMessage response, string errMessage, Exception ex, CancellationToken cancellationToken) { - var responseContent = ""; + var responseContent = string.Empty; + if (response is not null) { - responseContent = await response.Content.ReadAsStringAsync(cancellationToken); + responseContent = await ReadResponseBodyAsync(response, cancellationToken); } Logger.LogError(ex, $"{errMessage}. Response content:{responseContent}"); diff --git a/src/TesApi.Tests/ContainerRegistryProviderTests.cs b/src/TesApi.Tests/ContainerRegistryProviderTests.cs index 7c089f35b..3a94ab463 100644 --- a/src/TesApi.Tests/ContainerRegistryProviderTests.cs +++ b/src/TesApi.Tests/ContainerRegistryProviderTests.cs @@ -51,7 +51,7 @@ public async Task GetContainerRegistryInfoAsync_ServerIsAccessible_ReturnsAndAdd var server = "registry.com"; var image = $"{server}/image"; retryHandlerMock.Setup(r => - r.ExecuteWithRetryAsync(It.IsAny>>>(), It.IsAny())) + r.ExecuteWithRetryAsync(It.IsAny>>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(new List() { new ContainerRegistryInfo() { RegistryServer = server } @@ -81,7 +81,7 @@ public async Task GetContainerRegistryInfoAsync_ServerInCache_ReturnsRegistryInf Assert.AreEqual(server, container.RegistryServer); appCacheMock.Verify(c => c.TryGetValue(It.Is(v => $"{nameof(ContainerRegistryProvider)}:{image}".Equals(v)), out It.Ref.IsAny), Times.Once()); retryHandlerMock.Verify(r => - r.ExecuteWithRetryAsync(It.IsAny>>>(), It.IsAny()), Times.Never); + r.ExecuteWithRetryAsync(It.IsAny>>>(), It.IsAny(), It.IsAny()), Times.Never); } [TestMethod] @@ 
-103,7 +103,7 @@ public async Task GetContainerRegistryInfoAsync_NoAccessibleServerNoServerCached retryHandlerMock.Setup(r => r.ExecuteWithRetryAsync( It.IsAny>>>(), - It.IsAny())) + It.IsAny(), It.IsAny())) .ReturnsAsync(new List() { new ContainerRegistryInfo() { RegistryServer = server } diff --git a/src/TesApi.Tests/TestServices/TestServiceProvider.cs b/src/TesApi.Tests/TestServices/TestServiceProvider.cs index fb4f5efaf..2ca1ab658 100644 --- a/src/TesApi.Tests/TestServices/TestServiceProvider.cs +++ b/src/TesApi.Tests/TestServices/TestServiceProvider.cs @@ -83,6 +83,7 @@ internal TestServiceProvider( .AddTransient>(_ => NullLogger.Instance) .AddTransient>(_ => NullLogger.Instance) .AddTransient>(_ => NullLogger.Instance) + .AddTransient>(_ => NullLogger.Instance) .AddSingleton() .AddSingleton() .AddSingleton() diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index ddd42f3cb..15fc6f456 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -24,7 +24,10 @@ using Newtonsoft.Json; using Polly; using Polly.Retry; +using Tes.ApiClients; +using Tes.ApiClients.Options; using Tes.Models; +using TesApi.Web.Extensions; using TesApi.Web.Management.Batch; using TesApi.Web.Management.Configuration; using TesApi.Web.Storage; @@ -46,11 +49,7 @@ namespace TesApi.Web public partial class AzureProxy : IAzureProxy { private const char BatchJobAttemptSeparator = '-'; - private static readonly AsyncRetryPolicy batchRaceConditionJobNotFoundRetryPolicy = Policy - .Handle(ex => ex.RequestInformation.BatchError.Code == BatchErrorCodeStrings.JobNotFound) - .WaitAndRetryAsync(5, retryAttempt => TimeSpan.FromSeconds(Math.Pow(2, retryAttempt))); - - private readonly AsyncRetryPolicy batchNodeNotReadyRetryPolicy; + private readonly AsyncRetryPolicy batchRetryPolicy; private readonly ILogger logger; private readonly BatchClient batchClient; @@ -67,15 +66,18 @@ public partial class AzureProxy : IAzureProxy /// /// The Azure Batch Account options 
/// + /// Retry policy options /// The logger /// - public AzureProxy(IOptions batchAccountOptions, IBatchPoolManager batchPoolManager, ILogger logger) + public AzureProxy(IOptions batchAccountOptions, IBatchPoolManager batchPoolManager, IOptions retryPolicyOptions, ILogger logger) { ArgumentNullException.ThrowIfNull(batchAccountOptions); - ArgumentNullException.ThrowIfNull(logger); ArgumentNullException.ThrowIfNull(batchPoolManager); + ArgumentNullException.ThrowIfNull(retryPolicyOptions); + ArgumentNullException.ThrowIfNull(logger); this.batchPoolManager = batchPoolManager; + this.logger = logger; if (string.IsNullOrWhiteSpace(batchAccountOptions.Value.AccountName)) { @@ -83,20 +85,11 @@ public AzureProxy(IOptions batchAccountOptions, IBatchPoolM throw new InvalidOperationException("The batch account name is missing from the the configuration."); } - this.logger = logger; - - this.batchNodeNotReadyRetryPolicy = Policy - .Handle(ex => "NodeNotReady".Equals(ex.RequestInformation?.BatchError?.Code, StringComparison.InvariantCultureIgnoreCase)) - .WaitAndRetryAsync( - 5, - (retryAttempt, exception, _) => (exception as BatchException).RequestInformation?.RetryAfter ?? TimeSpan.FromSeconds(Math.Pow(2, retryAttempt)), - (exception, delay, retryAttempt, _) => - { - var requestId = (exception as BatchException).RequestInformation?.ServiceRequestId; - var reason = (exception.InnerException as Microsoft.Azure.Batch.Protocol.Models.BatchErrorException)?.Response?.ReasonPhrase; - logger.LogDebug(exception, "Retry attempt {RetryAttempt} after delay {DelaySeconds} for NodeNotReady exception: ServiceRequestId: {ServiceRequestId}, BatchErrorCode: NodeNotReady, Reason: {ReasonPhrase}", retryAttempt, delay.TotalSeconds, requestId, reason); - return Task.FromResult(false); - }); + batchRetryPolicy = Policy + .Handle() + .WaitAndRetryAsync(retryPolicyOptions.Value.MaxRetryCount, + (attempt, exception, ctx) => (exception as BatchException)?.RequestInformation?.RetryAfter ?? 
TimeSpan.FromSeconds(Math.Pow(retryPolicyOptions.Value.ExponentialBackOffExponent, attempt)), + (outcome, timespan, retryCount, ctx) => { RetryHandler.OnRetry(outcome, timespan, retryCount, ctx); return Task.CompletedTask; }); if (!string.IsNullOrWhiteSpace(batchAccountOptions.Value.AppKey)) { @@ -127,6 +120,50 @@ public AzureProxy(IOptions batchAccountOptions, IBatchPoolM //} } + /// + /// Rethrows exception if exception is and the Batch API Error Code returned otherwise invokes . + /// + /// Polly retry handler. + /// + private static RetryHandler.OnRetryHandler OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenJobNotFound(RetryHandler.OnRetryHandler OnRetry) + => new((outcome, timespan, retryCount, correlationId) => + { + if (outcome is BatchException batchException && batchException.RequestInformation?.BatchError?.Code != BatchErrorCodeStrings.JobNotFound) + { + System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(outcome).Throw(); + } + OnRetry?.Invoke(outcome, timespan, retryCount, correlationId); + }); + + /// + /// Rethrows exception if exception is and the Batch API Error Code returned "NodeNotReady" otherwise invokes . + /// + /// Polly retry handler. + /// + private static RetryHandler.OnRetryHandler OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenNodeNotReady(RetryHandler.OnRetryHandler OnRetry) + => new((outcome, timespan, retryCount, correlationId) => + { + if (outcome is BatchException batchException && !"NodeNotReady".Equals(batchException.RequestInformation?.BatchError?.Code, StringComparison.InvariantCultureIgnoreCase)) + { + System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(outcome).Throw(); + } + OnRetry?.Invoke(outcome, timespan, retryCount, correlationId); + }); + + /// + /// A logging Polly retry handler. + /// + /// Calling method name. 
+ /// + private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHandler([System.Runtime.CompilerServices.CallerMemberName] string caller = default) + => new((exception, timeSpan, retryCount, correlationId) => + { + var requestId = (exception as BatchException)?.RequestInformation?.ServiceRequestId; + var reason = (exception.InnerException as Microsoft.Azure.Batch.Protocol.Models.BatchErrorException)?.Response?.ReasonPhrase; + logger?.LogError(exception, @"Retrying in {Method}: RetryCount: {RetryCount} RetryCount: {TimeSpan} BatchErrorCode: '{BatchErrorCode}', ApiStatusCode '{ApiStatusCode}', Reason: '{ReasonPhrase}' ServiceRequestId: '{ServiceRequestId}', CorrelationId: {CorrelationId:D}", + caller, retryCount, timeSpan, (exception as BatchException)?.RequestInformation?.BatchError?.Code, (exception as BatchException)?.RequestInformation?.HttpStatusCode, reason, requestId, correlationId); + }); + // TODO: Static method because the instrumentation key is needed in both Program.cs and Startup.cs and we wanted to avoid intializing the batch client twice. // Can we skip initializing app insights with a instrumentation key in Program.cs? If yes, change this to an instance method. 
/// @@ -231,9 +268,11 @@ public async Task CreateAutoPoolModeBatchJobAsync(string jobId, CloudTask cloudT try { logger.LogInformation($"TES task: {cloudTask.Id} adding task to job."); - job = await batchRaceConditionJobNotFoundRetryPolicy.ExecuteAsync(ct => + var ctx = new Context(); + ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenJobNotFound(LogRetryErrorOnRetryHandler())); + job = await batchRetryPolicy.ExecuteAsync((_, ct) => batchClient.JobOperations.GetJobAsync(job.Id, cancellationToken: ct), - cancellationToken); + ctx, cancellationToken); await job.AddTaskAsync(cloudTask, cancellationToken: cancellationToken); logger.LogInformation($"TES task: {cloudTask.Id} added task successfully."); @@ -277,9 +316,11 @@ public async Task AddBatchTaskAsync(string tesTaskId, CloudTask cloudTask, PoolI ArgumentException.ThrowIfNullOrEmpty(poolInformation?.PoolId, nameof(poolInformation)); logger.LogInformation("TES task: {TesTask} - Adding task to job {BatchJob}", tesTaskId, poolInformation.PoolId); - var job = await batchRaceConditionJobNotFoundRetryPolicy.ExecuteAsync(ct => - batchClient.JobOperations.GetJobAsync(poolInformation.PoolId, cancellationToken: ct), - cancellationToken); + var ctx = new Context(); + ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenJobNotFound(LogRetryErrorOnRetryHandler())); + var job = await batchRetryPolicy.ExecuteAsync((_, ct) => + batchClient.JobOperations.GetJobAsync(poolInformation.PoolId, cancellationToken: ct), + ctx, cancellationToken); await job.AddTaskAsync(cloudTask, cancellationToken: cancellationToken); logger.LogInformation("TES task: {TesTask} - Added task successfully", tesTaskId); @@ -523,7 +564,9 @@ public async Task DeleteBatchTaskAsync(string tesTaskId, PoolInformation pool, C foreach (var task in batchTasksToDelete) { logger.LogInformation("Deleting task {BatchTask}", task.Id); - await batchNodeNotReadyRetryPolicy.ExecuteAsync(ct => task.DeleteAsync(cancellationToken: ct), 
cancellationToken); + var ctx = new Context(); + ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenNodeNotReady(LogRetryErrorOnRetryHandler())); + await batchRetryPolicy.ExecuteAsync((_, ct) => task.DeleteAsync(cancellationToken: ct), ctx, cancellationToken); } } diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index f0cab3ea7..2ed9dff54 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -10,12 +10,10 @@ using Microsoft.Azure.Batch; using Microsoft.Azure.Batch.Common; using Microsoft.Extensions.Caching.Memory; -using Microsoft.Extensions.Options; +using Microsoft.Extensions.Logging; using Polly; -using Polly.Retry; using Tes.ApiClients; -using Tes.ApiClients.Options; -using TesApi.Web.Management.Configuration; +using TesApi.Web.Extensions; using TesApi.Web.Storage; using BatchModels = Microsoft.Azure.Management.Batch.Models; @@ -26,43 +24,80 @@ namespace TesApi.Web /// public class CachingWithRetriesAzureProxy : IAzureProxy { + private readonly ILogger logger; private readonly IAzureProxy azureProxy; private readonly CachingRetryHandler cachingRetryHandler; - private readonly AsyncRetryPolicy batchPoolOrJobCreateOrTaskAddHandler; /// /// Contructor to create a cache of /// /// - /// /// - public CachingWithRetriesAzureProxy(IAzureProxy azureProxy, IOptions retryPolicyOptions, CachingRetryHandler cachingRetryHandler) + /// + public CachingWithRetriesAzureProxy(IAzureProxy azureProxy, CachingRetryHandler cachingRetryHandler, ILogger logger) { ArgumentNullException.ThrowIfNull(azureProxy); ArgumentNullException.ThrowIfNull(cachingRetryHandler); this.cachingRetryHandler = cachingRetryHandler; this.azureProxy = azureProxy; + this.logger = logger; + } + + private static readonly string[] CreationErrorFoundCodes = new[] + { + BatchErrorCodeStrings.TaskExists, + BatchErrorCodeStrings.PoolExists, + BatchErrorCodeStrings.JobExists 
+ }; - var creationErrorFoundCodes = new string[] + private static readonly string[] DeletionErrorFoundCodes = new[] + { + BatchErrorCodeStrings.TaskNotFound, + BatchErrorCodeStrings.PoolNotFound, + BatchErrorCodeStrings.JobNotFound + }; + + /// + /// Rethrows exception if exception is and the Batch API call returned otherwise invokes . + /// + /// Polly retry handler. + /// + private static RetryHandler.OnRetryHandler OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenExists(RetryHandler.OnRetryHandler OnRetry) + => new((outcome, timespan, retryCount, correlationId) => { - BatchErrorCodeStrings.TaskExists, - BatchErrorCodeStrings.PoolExists, - BatchErrorCodeStrings.JobExists - }; + if (outcome is BatchException batchException && CreationErrorFoundCodes.Contains(batchException.RequestInformation?.BatchError?.Code, StringComparer.OrdinalIgnoreCase)) + { + ExceptionDispatchInfo.Capture(outcome).Throw(); + } + OnRetry?.Invoke(outcome, timespan, retryCount, correlationId); + }); - batchPoolOrJobCreateOrTaskAddHandler = Policy - .Handle() - .WaitAndRetryAsync(retryPolicyOptions.Value.MaxRetryCount, - (attempt) => TimeSpan.FromSeconds(Math.Pow(retryPolicyOptions.Value.ExponentialBackOffExponent, attempt)), - (exception, timeSpan) => - { - if (exception is BatchException batchException && creationErrorFoundCodes.Contains(batchException.RequestInformation?.BatchError?.Code, StringComparer.OrdinalIgnoreCase)) - { - ExceptionDispatchInfo.Capture(exception).Throw(); - } - }); - } + /// + /// Rethrows exception if exception is and the Batch API call returned otherwise invokes . + /// + /// Polly retry handler. 
+ /// + private static RetryHandler.OnRetryHandler OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound(RetryHandler.OnRetryHandler OnRetry) + => new((outcome, timespan, retryCount, correlationId) => + { + if (outcome is BatchException batchException && DeletionErrorFoundCodes.Contains(batchException.RequestInformation?.BatchError?.Code, StringComparer.OrdinalIgnoreCase)) + { + ExceptionDispatchInfo.Capture(outcome).Throw(); + } + OnRetry?.Invoke(outcome, timespan, retryCount, correlationId); + }); + + /// + /// A logging Polly retry handler. + /// + /// Calling method name. + /// + private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHandler([System.Runtime.CompilerServices.CallerMemberName] string caller = default) + => new((exception, timeSpan, retryCount, correlationId) => + { + logger?.LogError(exception, @"Retrying in {Method}: RetryCount: {RetryCount} RetryCount: {TimeSpan} CorrelationId: {CorrelationId:D}", caller, retryCount, timeSpan, correlationId); + }); /// @@ -73,7 +108,9 @@ public async Task CreateBatchJobAsync(PoolInformation poolInformation, Cancellat { try { - await batchPoolOrJobCreateOrTaskAddHandler.ExecuteAsync(ct => azureProxy.CreateBatchJobAsync(poolInformation, ct), cancellationToken); + var ctx = new Context(); + ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenExists(LogRetryErrorOnRetryHandler())); + await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.CreateBatchJobAsync(poolInformation, ct), cancellationToken, ctx); } catch (BatchException exc) when (BatchErrorCodeStrings.JobExists.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { } @@ -84,29 +121,81 @@ public async Task AddBatchTaskAsync(string tesTaskId, CloudTask cloudTask, PoolI { try { - await batchPoolOrJobCreateOrTaskAddHandler.ExecuteAsync(ct => azureProxy.AddBatchTaskAsync(tesTaskId, cloudTask, poolInformation, ct), cancellationToken); + var ctx = new Context(); + 
ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenExists(LogRetryErrorOnRetryHandler())); + await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.AddBatchTaskAsync(tesTaskId, cloudTask, poolInformation, ct), cancellationToken, ctx); } catch (BatchException exc) when (BatchErrorCodeStrings.TaskExists.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { } } /// - public Task DeleteBatchJobAsync(PoolInformation poolInformation, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchJobAsync(poolInformation, ct), cancellationToken); + public async Task DeleteBatchJobAsync(PoolInformation poolInformation, CancellationToken cancellationToken) + { + try + { + var ctx = new Context(); + ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound(LogRetryErrorOnRetryHandler())); + await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchJobAsync(poolInformation, ct), cancellationToken, ctx); + } + catch (BatchException exc) when (BatchErrorCodeStrings.JobNotFound.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) + { } + } /// - public Task DeleteBatchJobAsync(string taskId, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchJobAsync(taskId, ct), cancellationToken); + public async Task DeleteBatchJobAsync(string taskId, CancellationToken cancellationToken) + { + try + { + var ctx = new Context(); + ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound(LogRetryErrorOnRetryHandler())); + await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchJobAsync(taskId, ct), cancellationToken, ctx); + } + catch (BatchException exc) when (BatchErrorCodeStrings.JobNotFound.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) + { } + } /// - 
public Task DeleteBatchTaskAsync(string taskId, PoolInformation poolInformation, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchTaskAsync(taskId, poolInformation, ct), cancellationToken); + public async Task DeleteBatchTaskAsync(string taskId, PoolInformation poolInformation, CancellationToken cancellationToken) + { + try + { + var ctx = new Context(); + ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound(LogRetryErrorOnRetryHandler())); + await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchTaskAsync(taskId, poolInformation, ct), cancellationToken, ctx); + } + catch (BatchException exc) when (BatchErrorCodeStrings.TaskNotFound.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) + { } + } /// - public Task DeleteBatchPoolAsync(string poolId, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchPoolAsync(poolId, ct), cancellationToken); + public async Task DeleteBatchPoolAsync(string poolId, CancellationToken cancellationToken) + { + try + { + var ctx = new Context(); + ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound(LogRetryErrorOnRetryHandler())); + await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchPoolAsync(poolId, ct), cancellationToken, ctx); + } + catch (BatchException exc) when (BatchErrorCodeStrings.TaskNotFound.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) + { } + } /// - public Task GetBatchPoolAsync(string poolId, CancellationToken cancellationToken, DetailLevel detailLevel) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBatchPoolAsync(poolId, ct, detailLevel), cancellationToken); + public Task GetBatchPoolAsync(string poolId, CancellationToken cancellationToken, DetailLevel detailLevel) + { + var ctx = new Context(); + 
ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBatchPoolAsync(poolId, ct, detailLevel), cancellationToken, ctx); + } /// - public Task GetBatchJobAsync(string jobId, CancellationToken cancellationToken, DetailLevel detailLevel) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBatchJobAsync(jobId, ct, detailLevel), cancellationToken); + public Task GetBatchJobAsync(string jobId, CancellationToken cancellationToken, DetailLevel detailLevel) + { + var ctx = new Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBatchJobAsync(jobId, ct, detailLevel), cancellationToken, ctx); + } /// public async Task DeleteBatchComputeNodesAsync(string poolId, IEnumerable computeNodes, CancellationToken cancellationToken) @@ -116,40 +205,94 @@ public async Task DeleteBatchComputeNodesAsync(string poolId, IEnumerable - public Task DownloadBlobAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DownloadBlobAsync(blobAbsoluteUri, ct), cancellationToken); + public Task DownloadBlobAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) + { + var ctx = new Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DownloadBlobAsync(blobAbsoluteUri, ct), cancellationToken, ctx); + } /// - public Task BlobExistsAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.BlobExistsAsync(blobAbsoluteUri, ct), cancellationToken); + public Task BlobExistsAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) + { + var ctx = new Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => 
azureProxy.BlobExistsAsync(blobAbsoluteUri, ct), cancellationToken, ctx); + } /// - public Task> GetActivePoolIdsAsync(string prefix, TimeSpan minAge, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetActivePoolIdsAsync(prefix, minAge, ct), cancellationToken); + public Task> GetActivePoolIdsAsync(string prefix, TimeSpan minAge, CancellationToken cancellationToken) + { + var ctx = new Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetActivePoolIdsAsync(prefix, minAge, ct), cancellationToken, ctx); + } /// - public IAsyncEnumerable GetActivePoolsAsync(string hostName) => cachingRetryHandler.ExecuteWithRetry(() => azureProxy.GetActivePoolsAsync(hostName)); + public IAsyncEnumerable GetActivePoolsAsync(string hostName) + { + var ctx = new Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return cachingRetryHandler.ExecuteWithRetry(() => azureProxy.GetActivePoolsAsync(hostName), ctx); + } /// - public int GetBatchActiveJobCount() => cachingRetryHandler.ExecuteWithRetry(() => azureProxy.GetBatchActiveJobCount()); + public int GetBatchActiveJobCount() + { + var ctx = new Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return cachingRetryHandler.ExecuteWithRetry(azureProxy.GetBatchActiveJobCount, ctx); + } /// - public IEnumerable GetBatchActiveNodeCountByVmSize() => cachingRetryHandler.ExecuteWithRetry(() => azureProxy.GetBatchActiveNodeCountByVmSize()); + public IEnumerable GetBatchActiveNodeCountByVmSize() + { + var ctx = new Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return cachingRetryHandler.ExecuteWithRetry(azureProxy.GetBatchActiveNodeCountByVmSize, ctx); + } /// - public int GetBatchActivePoolCount() => cachingRetryHandler.ExecuteWithRetry(() => azureProxy.GetBatchActivePoolCount()); + public int GetBatchActivePoolCount() + { + var ctx = new Context(); + 
ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return cachingRetryHandler.ExecuteWithRetry(azureProxy.GetBatchActivePoolCount, ctx); + } /// - public Task GetBatchJobAndTaskStateAsync(Tes.Models.TesTask tesTask, bool usingAutoPools, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBatchJobAndTaskStateAsync(tesTask, usingAutoPools, ct), cancellationToken); + public Task GetBatchJobAndTaskStateAsync(Tes.Models.TesTask tesTask, bool usingAutoPools, CancellationToken cancellationToken) + { + var ctx = new Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBatchJobAndTaskStateAsync(tesTask, usingAutoPools, ct), cancellationToken, ctx); + } /// - public Task GetNextBatchJobIdAsync(string tesTaskId, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetNextBatchJobIdAsync(tesTaskId, ct), cancellationToken); + public Task GetNextBatchJobIdAsync(string tesTaskId, CancellationToken cancellationToken) + { + var ctx = new Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetNextBatchJobIdAsync(tesTaskId, ct), cancellationToken, ctx); + } /// - public Task> GetPoolIdsReferencedByJobsAsync(CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(azureProxy.GetPoolIdsReferencedByJobsAsync, cancellationToken); + public Task> GetPoolIdsReferencedByJobsAsync(CancellationToken cancellationToken) + { + var ctx = new Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return cachingRetryHandler.ExecuteWithRetryAsync(azureProxy.GetPoolIdsReferencedByJobsAsync, cancellationToken, ctx); + } /// public Task GetStorageAccountKeyAsync(StorageAccountInfo storageAccountInfo, CancellationToken cancellationToken) - => 
cachingRetryHandler.ExecuteWithRetryAndCachingAsync($"{nameof(CachingWithRetriesAzureProxy)}:{storageAccountInfo.Id}", - ct => azureProxy.GetStorageAccountKeyAsync(storageAccountInfo, ct), DateTimeOffset.Now.AddHours(1), cancellationToken); + { + var ctx = new Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return cachingRetryHandler.ExecuteWithRetryAndCachingAsync($"{nameof(CachingWithRetriesAzureProxy)}:{storageAccountInfo.Id}", + ct => azureProxy.GetStorageAccountKeyAsync(storageAccountInfo, ct), DateTimeOffset.Now.AddHours(1), cancellationToken, ctx); + } /// public async Task GetStorageAccountInfoAsync(string storageAccountName, CancellationToken cancellationToken) @@ -158,7 +301,9 @@ public async Task GetStorageAccountInfoAsync(string storageA if (storageAccountInfo is null) { - storageAccountInfo = await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetStorageAccountInfoAsync(storageAccountName, ct), cancellationToken); + var ctx = new Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + storageAccountInfo = await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetStorageAccountInfoAsync(storageAccountName, ct), cancellationToken, ctx); if (storageAccountInfo is not null) { @@ -170,22 +315,52 @@ public async Task GetStorageAccountInfoAsync(string storageA } /// - public Task> ListBlobsAsync(Uri directoryUri, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.ListBlobsAsync(directoryUri, ct), cancellationToken); + public Task> ListBlobsAsync(Uri directoryUri, CancellationToken cancellationToken) + { + var ctx = new Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.ListBlobsAsync(directoryUri, ct), cancellationToken, ctx); + } /// - public Task> ListOldJobsToDeleteAsync(TimeSpan oldestJobAge, CancellationToken cancellationToken) => 
cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.ListOldJobsToDeleteAsync(oldestJobAge, ct), cancellationToken); + public Task> ListOldJobsToDeleteAsync(TimeSpan oldestJobAge, CancellationToken cancellationToken) + { + var ctx = new Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.ListOldJobsToDeleteAsync(oldestJobAge, ct), cancellationToken, ctx); + } /// - public Task> ListOrphanedJobsToDeleteAsync(TimeSpan minJobAge, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.ListOrphanedJobsToDeleteAsync(minJobAge, ct), cancellationToken); + public Task> ListOrphanedJobsToDeleteAsync(TimeSpan minJobAge, CancellationToken cancellationToken) + { + var ctx = new Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.ListOrphanedJobsToDeleteAsync(minJobAge, ct), cancellationToken, ctx); + } /// - public Task UploadBlobAsync(Uri blobAbsoluteUri, string content, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.UploadBlobAsync(blobAbsoluteUri, content, ct), cancellationToken); + public Task UploadBlobAsync(Uri blobAbsoluteUri, string content, CancellationToken cancellationToken) + { + var ctx = new Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.UploadBlobAsync(blobAbsoluteUri, content, ct), cancellationToken, ctx); + } /// - public Task UploadBlobFromFileAsync(Uri blobAbsoluteUri, string filePath, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.UploadBlobFromFileAsync(blobAbsoluteUri, filePath, ct), cancellationToken); + public Task UploadBlobFromFileAsync(Uri blobAbsoluteUri, string filePath, CancellationToken cancellationToken) + { + var ctx = new Context(); + 
ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.UploadBlobFromFileAsync(blobAbsoluteUri, filePath, ct), cancellationToken, ctx); + } /// - public Task GetBlobPropertiesAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) => cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBlobPropertiesAsync(blobAbsoluteUri, ct), cancellationToken); + public Task GetBlobPropertiesAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) + { + var ctx = new Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBlobPropertiesAsync(blobAbsoluteUri, ct), cancellationToken, ctx); + } /// public bool LocalFileExists(string path) => azureProxy.LocalFileExists(path); @@ -201,7 +376,9 @@ public async Task CreateBatchPoolAsync(BatchModels.Pool poolInf { try { - return await batchPoolOrJobCreateOrTaskAddHandler.ExecuteAsync(ct => azureProxy.CreateBatchPoolAsync(poolInfo, isPreemptable, ct), cancellationToken); + var ctx = new Context(); + ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenExists(LogRetryErrorOnRetryHandler())); + return await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.CreateBatchPoolAsync(poolInfo, isPreemptable, ct), cancellationToken, ctx); } catch (BatchException exc) when (BatchErrorCodeStrings.PoolExists.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { @@ -215,16 +392,31 @@ public Task DeleteBatchPoolIfExistsAsync(string poolId, CancellationToken cancel /// public Task GetFullAllocationStateAsync(string poolId, CancellationToken cancellationToken) - => cachingRetryHandler.ExecuteWithRetryAndCachingAsync( + { + var ctx = new Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return cachingRetryHandler.ExecuteWithRetryAndCachingAsync( $"{nameof(CachingWithRetriesAzureProxy)}:{poolId}", ct 
=> azureProxy.GetFullAllocationStateAsync(poolId, ct), - DateTimeOffset.Now.Add(BatchPoolService.RunInterval).Subtract(TimeSpan.FromSeconds(1)), cancellationToken); + DateTimeOffset.Now.Add(BatchPoolService.RunInterval).Subtract(TimeSpan.FromSeconds(1)), + cancellationToken, ctx); + } /// - public IAsyncEnumerable ListComputeNodesAsync(string poolId, DetailLevel detailLevel) => cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListComputeNodesAsync(poolId, detailLevel), cachingRetryHandler.RetryPolicy); + public IAsyncEnumerable ListComputeNodesAsync(string poolId, DetailLevel detailLevel) + { + var ctx = new Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListComputeNodesAsync(poolId, detailLevel), cachingRetryHandler.RetryPolicy, ctx); + } /// - public IAsyncEnumerable ListTasksAsync(string jobId, DetailLevel detailLevel) => cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListTasksAsync(jobId, detailLevel), cachingRetryHandler.RetryPolicy); + public IAsyncEnumerable ListTasksAsync(string jobId, DetailLevel detailLevel) + { + var ctx = new Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListTasksAsync(jobId, detailLevel), cachingRetryHandler.RetryPolicy, ctx); + } /// public Task DisableBatchPoolAutoScaleAsync(string poolId, CancellationToken cancellationToken) => azureProxy.DisableBatchPoolAutoScaleAsync(poolId, cancellationToken); diff --git a/src/TesApi.Web/PagedInterfaceExtensions.cs b/src/TesApi.Web/Extensions/PagedInterfaceExtensions.cs similarity index 92% rename from src/TesApi.Web/PagedInterfaceExtensions.cs rename to src/TesApi.Web/Extensions/PagedInterfaceExtensions.cs index 0c7efd779..d594cb0ac 100644 --- a/src/TesApi.Web/PagedInterfaceExtensions.cs +++ b/src/TesApi.Web/Extensions/PagedInterfaceExtensions.cs @@ -11,7 +11,7 @@ 
using Polly.Retry; // TODO: move this to Common.csproj? -namespace TesApi.Web +namespace TesApi.Web.Extensions { /// /// Extension methods and implementations for enumerating paged enumeration/collection types from Azure @@ -57,14 +57,16 @@ public static IAsyncEnumerable ToAsyncEnumerable(this IPage source, Fun /// Policy retrying calls made while enumerating results returned by . /// Method returning . /// Policy retrying call to . + /// An optional . /// - public static IAsyncEnumerable ExecuteAsync(this AsyncRetryPolicy asyncRetryPolicy, Func> func, RetryPolicy retryPolicy) + public static IAsyncEnumerable ExecuteAsync(this AsyncRetryPolicy asyncRetryPolicy, Func> func, RetryPolicy retryPolicy, Polly.Context ctx = default) { ArgumentNullException.ThrowIfNull(asyncRetryPolicy); ArgumentNullException.ThrowIfNull(func); ArgumentNullException.ThrowIfNull(retryPolicy); - return new PollyAsyncEnumerable((retryPolicy).Execute(() => func()), asyncRetryPolicy); + ctx ??= new(); + return new PollyAsyncEnumerable((retryPolicy).Execute(() => func()), asyncRetryPolicy, ctx); } #region Implementation classes @@ -101,18 +103,21 @@ private sealed class PollyAsyncEnumerable : IAsyncEnumerable { private readonly IAsyncEnumerable _source; private readonly AsyncRetryPolicy _retryPolicy; + private readonly Polly.Context _ctx; - public PollyAsyncEnumerable(IAsyncEnumerable source, AsyncRetryPolicy retryPolicy) + public PollyAsyncEnumerable(IAsyncEnumerable source, AsyncRetryPolicy retryPolicy, Polly.Context ctx) { ArgumentNullException.ThrowIfNull(source); ArgumentNullException.ThrowIfNull(retryPolicy); + ArgumentNullException.ThrowIfNull(ctx); _source = source; _retryPolicy = retryPolicy; + _ctx = ctx; } IAsyncEnumerator IAsyncEnumerable.GetAsyncEnumerator(CancellationToken cancellationToken) - => new PollyAsyncEnumerator(_source.GetAsyncEnumerator(cancellationToken), _retryPolicy, cancellationToken); + => new PollyAsyncEnumerator(_source.GetAsyncEnumerator(cancellationToken), 
_retryPolicy, _ctx, cancellationToken); } private sealed class PollyAsyncEnumerator : IAsyncEnumerator @@ -120,14 +125,17 @@ private sealed class PollyAsyncEnumerator : IAsyncEnumerator private readonly IAsyncEnumerator _source; private readonly AsyncRetryPolicy _retryPolicy; private readonly CancellationToken _cancellationToken; + private readonly Polly.Context _ctx; - public PollyAsyncEnumerator(IAsyncEnumerator source, AsyncRetryPolicy retryPolicy, CancellationToken cancellationToken) + public PollyAsyncEnumerator(IAsyncEnumerator source, AsyncRetryPolicy retryPolicy, Polly.Context ctx, CancellationToken cancellationToken) { ArgumentNullException.ThrowIfNull(source); ArgumentNullException.ThrowIfNull(retryPolicy); + ArgumentNullException.ThrowIfNull(ctx); _source = source; _retryPolicy = retryPolicy; + _ctx = ctx; _cancellationToken = cancellationToken; } @@ -138,7 +146,7 @@ ValueTask IAsyncDisposable.DisposeAsync() => _source.DisposeAsync(); ValueTask IAsyncEnumerator.MoveNextAsync() - => new(_retryPolicy.ExecuteAsync(ct => _source.MoveNextAsync(ct).AsTask(), _cancellationToken)); + => new(_retryPolicy.ExecuteAsync((_, ct) => _source.MoveNextAsync(ct).AsTask(), new(_ctx.OperationKey, _ctx), _cancellationToken)); } private sealed class PageEnumerator : EnumeratorEnumerator> diff --git a/src/TesApi.Web/Management/ArmResourceInformationFinder.cs b/src/TesApi.Web/Management/ArmResourceInformationFinder.cs index 1ec0ca972..e047747eb 100644 --- a/src/TesApi.Web/Management/ArmResourceInformationFinder.cs +++ b/src/TesApi.Web/Management/ArmResourceInformationFinder.cs @@ -9,6 +9,7 @@ using Microsoft.Azure.Management.Batch; using Microsoft.Azure.Services.AppAuthentication; using Microsoft.Rest; +using TesApi.Web.Extensions; namespace TesApi.Web.Management { diff --git a/src/TesApi.Web/Management/ContainerRegistryProvider.cs b/src/TesApi.Web/Management/ContainerRegistryProvider.cs index 4f29a8e53..0fcede6d7 100644 --- 
a/src/TesApi.Web/Management/ContainerRegistryProvider.cs +++ b/src/TesApi.Web/Management/ContainerRegistryProvider.cs @@ -99,7 +99,10 @@ public bool IsImagePublic(string imageName) private async Task LookUpAndAddToCacheContainerRegistryInfoAsync(string imageName, CancellationToken cancellationToken) { - var repositories = await CachingRetryHandler.ExecuteWithRetryAsync(GetAccessibleContainerRegistriesAsync, cancellationToken: cancellationToken); + var ctx = new Polly.Context(); + ctx.SetOnRetryHandler((outcome, timespan, retryCount, correlationId) => + Logger.LogError(outcome, @"Retrying in {Method}: RetryCount: {RetryCount} RetryCount: {TimeSpan} CorrelationId: {CorrelationId:D}", nameof(LookUpAndAddToCacheContainerRegistryInfoAsync), retryCount, timespan, correlationId)); + var repositories = await CachingRetryHandler.ExecuteWithRetryAsync(GetAccessibleContainerRegistriesAsync, cancellationToken: cancellationToken, context: ctx); var requestedRepo = repositories?.FirstOrDefault(reg => reg.RegistryServer.Equals(imageName.Split('/').FirstOrDefault(), StringComparison.OrdinalIgnoreCase)); From 60ba2339568820d60e03f3018b9895b479c64789 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 10 Nov 2023 09:57:01 -0800 Subject: [PATCH 082/202] Address feedback --- src/Tes.ApiClients/HttpApiClient.cs | 54 +++++++++---------- src/TesApi.Web/AzureProxy.cs | 8 +-- .../CachingWithRetriesAzureProxy.cs | 3 +- .../Management/ContainerRegistryProvider.cs | 18 +++---- 4 files changed, 39 insertions(+), 44 deletions(-) diff --git a/src/Tes.ApiClients/HttpApiClient.cs b/src/Tes.ApiClients/HttpApiClient.cs index 03b2cd3b1..53eeb4bec 100644 --- a/src/Tes.ApiClients/HttpApiClient.cs +++ b/src/Tes.ApiClients/HttpApiClient.cs @@ -77,25 +77,26 @@ protected HttpApiClient() { } private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHandler([System.Runtime.CompilerServices.CallerMemberName] string? 
caller = default) => new((exception, timeSpan, retryCount, correlationId) => { - Logger?.LogError(exception, @"Retrying in {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan} CorrelationId: {CorrelationId:D}", caller, retryCount, timeSpan, correlationId); + Logger?.LogError(exception, @"Retrying in {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan} CorrelationId: {CorrelationId}", caller, retryCount, timeSpan, correlationId.ToString("D")); }); /// /// A logging Polly retry handler. /// - /// See /// Calling method name. - /// - private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHandler([System.Runtime.CompilerServices.CallerMemberName] string? caller = default) + /// + private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHttpResponseMessageHandler([System.Runtime.CompilerServices.CallerMemberName] string? caller = default) => new((result, timeSpan, retryCount, correlationId) => { if (result.Exception is null) { - Logger?.LogError(@"Retrying in {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan} CorrelationId: {CorrelationId:D}", caller, retryCount, timeSpan, correlationId); + Logger?.LogError(@"Retrying in {Method} due to HTTP status {HttpStatus}: RetryCount: {RetryCount} TimeSpan: {TimeSpan} CorrelationId: {CorrelationId}", + caller, result.Result.StatusCode.ToString("G"), retryCount, timeSpan.ToString("c"), correlationId.ToString("D")); } else { - Logger?.LogError(result.Exception, @"Retrying in {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan} CorrelationId: {CorrelationId:D}", caller, retryCount, timeSpan, correlationId); + Logger?.LogError(result.Exception, @"Retrying in {Method} due to '{Message}': RetryCount: {RetryCount} TimeSpan: {TimeSpan} CorrelationId: {CorrelationId}", + caller, result.Exception.Message, retryCount, timeSpan.ToString("c"), correlationId.ToString("D")); } }); @@ -111,7 +112,7 @@ protected async Task HttpSendRequestWithRetryPolicyAsync( Func httpRequestFactory, CancellationToken cancellationToken, bool 
setAuthorizationHeader = false) { var ctx = new Polly.Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHttpResponseMessageHandler()); return await cachingRetryHandler.ExecuteWithRetryAsync(async ct => { var request = httpRequestFactory(); @@ -171,20 +172,8 @@ protected async Task HttpGetRequestWithCachingAndRetryPolicyAsync(Uri re { var cacheKey = await ToCacheKeyAsync(requestUrl, setAuthorizationHeader, cancellationToken); - return (await cachingRetryHandler.AppCache.GetOrCreateAsync(cacheKey, async _ => - { - var ctx = new Polly.Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); - var response = await cachingRetryHandler.ExecuteWithRetryAsync(async ct => - { - var httpRequest = await CreateGetHttpRequest(requestUrl, setAuthorizationHeader, ct); - - var httpResponse = await HttpClient.SendAsync(httpRequest, ct); - return httpResponse.EnsureSuccessStatusCode(); - }, cancellationToken, ctx); - - return await ReadResponseBodyAsync(response, cancellationToken); - }))!; + return (await cachingRetryHandler.AppCache.GetOrCreateAsync(cacheKey, + async _ => await HttpGetRequestWithRetryPolicyAsync(requestUrl, cancellationToken, setAuthorizationHeader)))!; } /// @@ -197,18 +186,23 @@ protected async Task HttpGetRequestWithCachingAndRetryPolicyAsync(Uri re protected async Task HttpGetRequestWithRetryPolicyAsync(Uri requestUrl, CancellationToken cancellationToken, bool setAuthorizationHeader = false) { - var ctx = new Polly.Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); - var response = await cachingRetryHandler.ExecuteWithRetryAsync(async ct => + var context = new Polly.Context(); + context.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); + return await cachingRetryHandler.ExecuteWithRetryAsync(async token => { - //request must be recreated in every retry. 
- var httpRequest = await CreateGetHttpRequest(requestUrl, setAuthorizationHeader, ct); + var ctx = new Polly.Context(); + ctx.SetOnRetryHandler(LogRetryErrorOnRetryHttpResponseMessageHandler()); + var response = await cachingRetryHandler.ExecuteWithRetryAsync(async ct => + { + //request must be recreated in every retry. + var httpRequest = await CreateGetHttpRequest(requestUrl, setAuthorizationHeader, ct); - var httpResponse = await HttpClient.SendAsync(httpRequest, ct); - return httpResponse.EnsureSuccessStatusCode(); - }, cancellationToken, ctx); + var httpResponse = await HttpClient.SendAsync(httpRequest, ct); + return httpResponse.EnsureSuccessStatusCode(); + }, token, ctx); - return await ReadResponseBodyAsync(response, cancellationToken); + return await ReadResponseBodyAsync(response, token); + }, cancellationToken, context); } /// diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index 15fc6f456..6433d7f5c 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -158,10 +158,10 @@ private static RetryHandler.OnRetryHandler OnRetryMicrosoftAzureBatchCommonBatch private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHandler([System.Runtime.CompilerServices.CallerMemberName] string caller = default) => new((exception, timeSpan, retryCount, correlationId) => { - var requestId = (exception as BatchException)?.RequestInformation?.ServiceRequestId; - var reason = (exception.InnerException as Microsoft.Azure.Batch.Protocol.Models.BatchErrorException)?.Response?.ReasonPhrase; - logger?.LogError(exception, @"Retrying in {Method}: RetryCount: {RetryCount} RetryCount: {TimeSpan} BatchErrorCode: '{BatchErrorCode}', ApiStatusCode '{ApiStatusCode}', Reason: '{ReasonPhrase}' ServiceRequestId: '{ServiceRequestId}', CorrelationId: {CorrelationId:D}", - caller, retryCount, timeSpan, (exception as BatchException)?.RequestInformation?.BatchError?.Code, (exception as BatchException)?.RequestInformation?.HttpStatusCode, reason, 
requestId, correlationId); + var requestId = (exception as BatchException)?.RequestInformation?.ServiceRequestId ?? "n/a"; + var reason = (exception.InnerException as Microsoft.Azure.Batch.Protocol.Models.BatchErrorException)?.Response?.ReasonPhrase ?? "n/a"; + logger?.LogError(exception, @"Retrying in {Method}: RetryCount: {RetryCount} RetryCount: {TimeSpan} BatchErrorCode: '{BatchErrorCode}', ApiStatusCode '{ApiStatusCode}', Reason: '{ReasonPhrase}' ServiceRequestId: '{ServiceRequestId}', CorrelationId: {CorrelationId}", + caller, retryCount, timeSpan.ToString("c"), (exception as BatchException)?.RequestInformation?.BatchError?.Code ?? "n/a", (exception as BatchException)?.RequestInformation?.HttpStatusCode?.ToString("G") ?? "n/a", reason, requestId, correlationId.ToString("D")); }); // TODO: Static method because the instrumentation key is needed in both Program.cs and Startup.cs and we wanted to avoid intializing the batch client twice. diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index 2ed9dff54..ac8f8b246 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -96,7 +96,8 @@ private static RetryHandler.OnRetryHandler OnRetryMicrosoftAzureBatchCommonBatch private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHandler([System.Runtime.CompilerServices.CallerMemberName] string caller = default) => new((exception, timeSpan, retryCount, correlationId) => { - logger?.LogError(exception, @"Retrying in {Method}: RetryCount: {RetryCount} RetryCount: {TimeSpan} CorrelationId: {CorrelationId:D}", caller, retryCount, timeSpan, correlationId); + logger?.LogError(exception, @"Retrying in {Method} due to '{Message}': RetryCount: {RetryCount} RetryCount: {TimeSpan} CorrelationId: {CorrelationId}", + caller, exception.Message, retryCount, timeSpan.ToString("c"), correlationId.ToString("D")); }); diff --git 
a/src/TesApi.Web/Management/ContainerRegistryProvider.cs b/src/TesApi.Web/Management/ContainerRegistryProvider.cs index 0fcede6d7..fb9810d8b 100644 --- a/src/TesApi.Web/Management/ContainerRegistryProvider.cs +++ b/src/TesApi.Web/Management/ContainerRegistryProvider.cs @@ -100,8 +100,9 @@ public bool IsImagePublic(string imageName) private async Task LookUpAndAddToCacheContainerRegistryInfoAsync(string imageName, CancellationToken cancellationToken) { var ctx = new Polly.Context(); - ctx.SetOnRetryHandler((outcome, timespan, retryCount, correlationId) => - Logger.LogError(outcome, @"Retrying in {Method}: RetryCount: {RetryCount} RetryCount: {TimeSpan} CorrelationId: {CorrelationId:D}", nameof(LookUpAndAddToCacheContainerRegistryInfoAsync), retryCount, timespan, correlationId)); + ctx.SetOnRetryHandler((exception, timespan, retryCount, correlationId) => + Logger.LogError(exception, @"Retrying in {Method} due to '{Message}': RetryCount: {RetryCount} RetryCount: {TimeSpan} CorrelationId: {CorrelationId}", + nameof(LookUpAndAddToCacheContainerRegistryInfoAsync), exception.Message, retryCount, timespan.ToString("c"), correlationId.ToString("D"))); var repositories = await CachingRetryHandler.ExecuteWithRetryAsync(GetAccessibleContainerRegistriesAsync, cancellationToken: cancellationToken, context: ctx); var requestedRepo = repositories?.FirstOrDefault(reg => @@ -109,12 +110,12 @@ private async Task LookUpAndAddToCacheContainerRegistryIn if (requestedRepo is not null) { - Logger.LogInformation($"Requested repository: {imageName} was found."); + Logger.LogInformation(@"Requested repository: {DockerImage} was found.", imageName); CachingRetryHandler.AppCache.Set($"{nameof(ContainerRegistryProvider)}:{imageName}", requestedRepo, DateTimeOffset.UtcNow.AddHours(options.RegistryInfoCacheExpirationInHours)); } else { - Logger.LogWarning($"The TES service did not find the requested repository: {imageName}"); + Logger.LogWarning(@"The TES service did not find the requested 
repository: {DockerImage}", imageName); } return requestedRepo; @@ -158,11 +159,11 @@ private async Task> GetAccessibleContainerReg { var registries = (await azureClient.WithSubscription(subId).ContainerRegistries.ListAsync(cancellationToken: cancellationToken)).ToList(); - Logger.LogInformation(@$"Searching {subId} for container registries."); + Logger.LogInformation(@"Searching {SubscriptionId} for container registries.", subId); foreach (var r in registries) { - Logger.LogInformation(@$"Found {r.Name}. AdminUserEnabled: {r.AdminUserEnabled}"); + Logger.LogInformation(@"Found {RegistryName}. AdminUserEnabled: {RegistryAdminUserEnabled}", r.Name, r.AdminUserEnabled); try { @@ -174,13 +175,13 @@ private async Task> GetAccessibleContainerReg } catch (Exception ex) { - Logger.LogWarning($"TES service doesn't have permission to get credentials for registry {r.LoginServerUrl}. Please verify that 'Admin user' is enabled in the 'Access Keys' area in the Azure Portal for this container registry. Exception: {ex}"); + Logger.LogWarning(ex, @"TES service doesn't have permission to get credentials for registry {RegistryLoginServerUrl}. Please verify that 'Admin user' is enabled in the 'Access Keys' area in the Azure Portal for this container registry. Exception: ({ExceptionType}): {ExceptionMessage}", r.LoginServerUrl, ex.GetType().FullName, ex.Message); } } } catch (Exception ex) { - Logger.LogWarning($"TES service doesn't have permission to list container registries in subscription {subId}. Exception: {ex}"); + Logger.LogWarning(@"TES service doesn't have permission to list container registries in subscription {SubscriptionId}. 
Exception: ({ExceptionType}): {ExceptionMessage}", subId, ex.GetType().FullName, ex.Message); } } @@ -188,6 +189,5 @@ private async Task> GetAccessibleContainerReg return infos; } - } } From 42ff99f5146c890a04cd0667c4554a857a8d70e1 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 10 Nov 2023 17:02:08 -0800 Subject: [PATCH 083/202] Refactor for clarity and address issues --- src/TesApi.Web/BatchPool.cs | 10 +- src/TesApi.Web/BatchScheduler.cs | 4 +- src/TesApi.Web/IBatchPool.cs | 10 +- .../OrchestrateOnBatchSchedulerServiceBase.cs | 4 +- src/TesApi.Web/PoolScheduler.cs | 107 +++++++------ src/TesApi.Web/TaskScheduler.cs | 148 +++++++++--------- src/TesApi.Web/appsettings.json | 4 +- 7 files changed, 152 insertions(+), 135 deletions(-) diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index c34c8173d..e74f487ea 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -624,14 +624,16 @@ async ValueTask RemoveMissingPoolsAsync(Exception ex, CancellationToken ca } /// - public IAsyncEnumerable GetTasksToDelete(CancellationToken cancellationToken) + public IAsyncEnumerable GetTasksToDelete(DateTime now, CancellationToken cancellationToken) { - return GetTasksAsync("creationTime,id", $"state eq 'completed' and creationTime lt datetime'{DateTimeOffset.UtcNow - TimeSpan.FromMinutes(10):O}'").Select(task => new IBatchScheduler.CloudTaskId(Id, task.Id, task.CreationTime.Value)); + now = now.ToUniversalTime(); + return GetTasksAsync("creationTime,id", $"state eq 'completed' and creationTime lt datetime'{now - TimeSpan.FromMinutes(10):O}' and stateTransitionTime gt datetime'{now + TimeSpan.FromMinutes(3)}'").Select(task => new IBatchScheduler.CloudTaskId(Id, task.Id, task.CreationTime.Value)); } /// - public async IAsyncEnumerable GetCloudTaskStatesAsync([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) + public async IAsyncEnumerable GetCloudTaskStatesAsync(DateTime now, 
[System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) { + now = now.ToUniversalTime(); List taskListWithComputeNodeInfo; // To check if the task was running when its node became preempted or unusable List activeTaskList; // These are candidates to be the victim of resizes or starttask failures List completedTaskList; // Backstop if events don't provide timely task completion information in a timely manner @@ -640,7 +642,7 @@ public async IAsyncEnumerable GetCloudTaskStatesAsync([ var taskList = await GetTasksAsync("executionInfo,id,nodeInfo,state,stateTransitionTime", null).ToListAsync(cancellationToken); taskListWithComputeNodeInfo = taskList.Where(task => !TaskState.Completed.Equals(task.State) && !string.IsNullOrEmpty(task.ComputeNodeInformation?.ComputeNodeId)).ToList(); activeTaskList = taskList.Where(task => TaskState.Active.Equals(task.State)).OrderByDescending(task => task.StateTransitionTime).ToList(); - completedTaskList = taskList.Where(task => TaskState.Completed.Equals(task.State) && task.StateTransitionTime < DateTime.UtcNow - TimeSpan.FromMinutes(2)).ToList(); + completedTaskList = taskList.Where(task => TaskState.Completed.Equals(task.State) && task.StateTransitionTime < now - TimeSpan.FromMinutes(2)).ToList(); } await foreach (var node in _azureProxy.ListComputeNodesAsync(Id, new ODATADetailLevel(filterClause: @"state eq 'preempted' or state eq 'unusable'", selectClause: @"errors,id,state")).WithCancellation(cancellationToken)) diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 37a3f3ebe..37c4ca0c9 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -353,7 +353,7 @@ Task HandleInfoUpdate(TesTask tesTask, CombinedBatchTaskInfo batchInfo, Ca }.AsReadOnly(); } - private async Task DeleteTerminatedTaskAsync(string taskId, string jobId, DateTime taskCreated, CancellationToken cancellationToken) + private async Task 
DeleteCompletedTaskAsync(string taskId, string jobId, DateTime taskCreated, CancellationToken cancellationToken) { // https://learn.microsoft.com/azure/batch/best-practices#manage-task-lifetime var mins10 = TimeSpan.FromMinutes(10); @@ -498,7 +498,7 @@ async Task WrapHandleTesTaskTransitionAsync(TesTask tesTask, AzureBatchTas public IAsyncEnumerable> DeleteCloudTasksAsync(IAsyncEnumerable cloudTasks, CancellationToken cancellationToken) { #pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously - return cloudTasks.SelectAwaitWithCancellation(async (task, cancellationToken) => new RelatedTask(DeleteTerminatedTaskAsync(task.TaskId, task.JobId, task.Created, cancellationToken), task)); + return cloudTasks.SelectAwaitWithCancellation(async (task, cancellationToken) => new RelatedTask(DeleteCompletedTaskAsync(task.TaskId, task.JobId, task.Created, cancellationToken), task)); #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously } diff --git a/src/TesApi.Web/IBatchPool.cs b/src/TesApi.Web/IBatchPool.cs index b23c3ec37..864ba17a3 100644 --- a/src/TesApi.Web/IBatchPool.cs +++ b/src/TesApi.Web/IBatchPool.cs @@ -61,16 +61,18 @@ public interface IBatchPool /// /// Gets compute node related task state information. /// - /// + /// Reference time. /// - IAsyncEnumerable GetCloudTaskStatesAsync(CancellationToken cancellationToken = default); + /// + IAsyncEnumerable GetCloudTaskStatesAsync(DateTime now, CancellationToken cancellationToken = default); /// /// Gets batch tasks that need to be deleted. /// - /// + /// Reference time. /// - IAsyncEnumerable GetTasksToDelete(CancellationToken cancellationToken = default); + /// + IAsyncEnumerable GetTasksToDelete(DateTime now, CancellationToken cancellationToken = default); /// /// Gets the last time the pool's compute node list was changed. 
diff --git a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs index 3afdefc73..6d7135118 100644 --- a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs +++ b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs @@ -192,9 +192,9 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func< if (requestInfo?.BatchError is not null) { logs.Add($"BatchErrorCode: {requestInfo.BatchError.Code}"); - logs.Add($"BatchErrorMessage: {requestInfo.BatchError.Message}"); + logs.Add($"BatchErrorMessage ({requestInfo.BatchError.Message.Language}): {requestInfo.BatchError.Message.Value}"); - foreach (var detail in requestInfo.BatchError.Values?.Select(d => $"{d.Key}={d.Value}") ?? Enumerable.Empty()) + foreach (var detail in requestInfo.BatchError.Values?.Select(d => $"BatchErrorDetail: '{d.Key}': '{d.Value}'") ?? Enumerable.Empty()) { logs.Add(detail); } diff --git a/src/TesApi.Web/PoolScheduler.cs b/src/TesApi.Web/PoolScheduler.cs index 1475c3ef0..af748b474 100644 --- a/src/TesApi.Web/PoolScheduler.cs +++ b/src/TesApi.Web/PoolScheduler.cs @@ -2,12 +2,10 @@ // Licensed under the MIT License. 
using System; -using System.Collections.Concurrent; using System.Collections.Generic; using System.Linq; using System.Threading; using System.Threading.Tasks; -using Microsoft.Azure.Batch; using Microsoft.Extensions.Logging; using Tes.Models; using Tes.Repository; @@ -98,68 +96,83 @@ await ExecuteActionOnPoolsAsync( "Service Batch Pools", async (pool, token) => { + var now = DateTime.UtcNow; await pool.ServicePoolAsync(token); - await ProcessCloudTaskStatesAsync(pool.Id, pool.GetCloudTaskStatesAsync(token), token); - await ProcessDeletedTasks(pool.GetTasksToDelete(token), token); + await ProcessCloudTaskStatesAsync(pool.Id, pool.GetCloudTaskStatesAsync(now, token), token); + await ProcessDeletedTasks(pool.GetTasksToDelete(now, token), token); }, cancellationToken); - }, stoppingToken); - - async ValueTask ProcessCloudTaskStatesAsync(string poolId, IAsyncEnumerable states, CancellationToken cancellationToken) - { - var list = new List<(TesTask TesTask, AzureBatchTaskState State)>(); + }, + stoppingToken); + } - await foreach (var (cloudTaskId, state) in states.WithCancellation(cancellationToken)) - { - TesTask tesTask = default; - if (await repository.TryGetItemAsync(batchScheduler.GetTesTaskIdFromCloudTaskId(cloudTaskId), cancellationToken, task => tesTask = task) && tesTask is not null) - { - list.Add((tesTask, state)); - } - else - { - logger.LogDebug(@"Unable to locate TesTask for CloudTask '{CloudTask}' with action state {ActionState}.", cloudTaskId, state.State); - } - } + /// + /// Updates each task based on the provided states. + /// + /// The batch pool/job from which the state was obtained. + /// The states with which to update the associated tes tasks. + /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. 
+ /// + private async ValueTask ProcessCloudTaskStatesAsync(string poolId, IAsyncEnumerable states, CancellationToken cancellationToken) + { + var list = new List<(TesTask TesTask, AzureBatchTaskState State)>(); - if (list.Count != 0) + await foreach (var (cloudTaskId, state) in states.WithCancellation(cancellationToken)) + { + TesTask tesTask = default; + if (await repository.TryGetItemAsync(batchScheduler.GetTesTaskIdFromCloudTaskId(cloudTaskId), cancellationToken, task => tesTask = task) && tesTask is not null) { - await OrchestrateTesTasksOnBatchAsync( - $"NodeState ({poolId})", -#pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously - async _ => list.Select(t => t.TesTask).ToAsyncEnumerable(), -#pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously - (tesTasks, token) => batchScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, list.Select(t => t.State).ToArray(), token), - cancellationToken); + list.Add((tesTask, state)); } else { - logger.LogDebug("No task state changes from pool/node information this time: PoolId: {PoolId}.", poolId); + logger.LogDebug(@"Unable to locate TesTask for CloudTask '{CloudTask}' with action state {ActionState}.", cloudTaskId, state.State); } } - async ValueTask ProcessDeletedTasks(IAsyncEnumerable tasks, CancellationToken cancellationToken) + if (list.Count != 0) + { + await OrchestrateTesTasksOnBatchAsync( + $"NodeState ({poolId})", +#pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously + async _ => list.Select(t => t.TesTask).ToAsyncEnumerable(), +#pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously + (tesTasks, token) => batchScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, list.Select(t => t.State).ToArray(), token), + cancellationToken); + } + else + { + logger.LogDebug("No task state changes from pool/node information this time: PoolId: {PoolId}.", 
poolId); + } + } + + /// + /// Deletes cloud tasks. + /// + /// Tasks to delete. + /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. + /// + private async ValueTask ProcessDeletedTasks(IAsyncEnumerable tasks, CancellationToken cancellationToken) + { + await foreach (var taskResult in batchScheduler.DeleteCloudTasksAsync(tasks, cancellationToken).WithCancellation(cancellationToken)) { - await foreach (var taskResult in batchScheduler.DeleteCloudTasksAsync(tasks, cancellationToken).WithCancellation(cancellationToken)) + try { - try - { - switch (await taskResult) - { - case true: - logger.LogDebug(@"Azure task {CloudTask} was deleted.", taskResult.Related.TaskId); - break; - - case false: - logger.LogDebug(@"Azure task {CloudTask} was NOT deleted.", taskResult.Related.TaskId); - break; - } - } - catch (Exception exc) + switch (await taskResult) { - logger.LogError(exc, @"Failed to delete azure task '{CloudTask}': '{ExceptionType}': '{ExceptionMessage}'", taskResult.Related.TaskId, exc.GetType().FullName, exc.Message); + case true: + logger.LogDebug(@"Azure task {CloudTask} was deleted.", taskResult.Related.TaskId); + break; + + case false: + logger.LogDebug(@"Azure task {CloudTask} was NOT deleted.", taskResult.Related.TaskId); + break; } } + catch (Exception exc) + { + logger.LogError(exc, @"Failed to delete azure task '{CloudTask}': '{ExceptionType}': '{ExceptionMessage}'", taskResult.Related.TaskId, exc.GetType().FullName, exc.Message); + } } } } diff --git a/src/TesApi.Web/TaskScheduler.cs b/src/TesApi.Web/TaskScheduler.cs index fd8dfc831..63c378420 100644 --- a/src/TesApi.Web/TaskScheduler.cs +++ b/src/TesApi.Web/TaskScheduler.cs @@ -124,83 +124,89 @@ private Task ExecuteCancelledTesTasksOnBatchAsync(CancellationToken stoppingToke private Task ExecuteUpdateTesTaskFromEventBlobAsync(CancellationToken stoppingToken) { return ExecuteActionOnIntervalAsync(blobRunInterval, - UpdateTesTasksFromEventBlobsAsync, + async 
cancellationToken => + await UpdateTesTasksFromAvailableEventsAsync( + await ParseAvailableEvents(cancellationToken), + cancellationToken), stoppingToken); } /// - /// Retrieves all event blobs from storage and updates the resultant state. + /// Determines the s from each event available for processing and their associated s. /// - /// Triggered when Microsoft.Extensions.Hosting.IHostedService.StopAsync(System.Threading.CancellationToken) is called. - /// - async ValueTask UpdateTesTasksFromEventBlobsAsync(CancellationToken stoppingToken) + /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. + /// s and s from all events. + async ValueTask MarkProcessedAsync)>> ParseAvailableEvents(CancellationToken cancellationToken) { - var markEventsProcessedList = new ConcurrentBag>(); - Func> getEventsInOrder; + var messages = new ConcurrentBag<(RunnerEventsMessage Message, TesTask Task, AzureBatchTaskState State, Func MarkProcessedAsync)>(); + // Get and parse event blobs + await Parallel.ForEachAsync(batchScheduler.GetEventMessagesAsync(cancellationToken), cancellationToken, async (eventMessage, cancellationToken) => { - var messages = new ConcurrentBag<(RunnerEventsMessage Message, TesTask Task, AzureBatchTaskState State)>(); + var tesTask = await GetTesTaskAsync(eventMessage.Tags["task-id"], eventMessage.Tags["event-name"]); - // Get and parse event blobs - await Parallel.ForEachAsync(batchScheduler.GetEventMessagesAsync(stoppingToken), stoppingToken, async (eventMessage, cancellationToken) => + if (tesTask is null) { - var tesTask = await GetTesTaskAsync(eventMessage.Tags["task-id"], eventMessage.Tags["event-name"]); + return; + } - if (tesTask is null) - { - return; - } + try + { + nodeEventProcessor.ValidateMessageMetadata(eventMessage); + eventMessage = await nodeEventProcessor.DownloadAndValidateMessageContentAsync(eventMessage, cancellationToken); + var state = await 
nodeEventProcessor.GetMessageBatchStateAsync(eventMessage, tesTask, cancellationToken); + messages.Add((eventMessage, tesTask, state, token => nodeEventProcessor.MarkMessageProcessedAsync(eventMessage, token))); + } + catch (Exception ex) + { + logger.LogError(ex, @"Downloading and parsing event failed: {ErrorMessage}", ex.Message); - try - { - nodeEventProcessor.ValidateMessageMetadata(eventMessage); - eventMessage = await nodeEventProcessor.DownloadAndValidateMessageContentAsync(eventMessage, cancellationToken); - var state = await nodeEventProcessor.GetMessageBatchStateAsync(eventMessage, tesTask, cancellationToken); - messages.Add((eventMessage, tesTask, state)); - } - catch (Exception ex) - { - logger.LogError(ex, @"Downloading and parsing event failed: {ErrorMessage}", ex.Message); - messages.Add((eventMessage, tesTask, new(AzureBatchTaskState.TaskState.InfoUpdate, Warning: new List + messages.Add(( + eventMessage, + tesTask, + new(AzureBatchTaskState.TaskState.InfoUpdate, Warning: new List { "EventParsingFailed", - $"{ex.GetType().FullName}: {ex.Message}", - }))); + $"{ex.GetType().FullName}: {ex.Message}" + }), + (ex is System.Diagnostics.UnreachableException || ex is RunnerEventsProcessor.AssertException) + ? token => nodeEventProcessor.MarkMessageProcessedAsync(eventMessage, token) // Don't retry this event + : default)); // Retry this event. - if (ex is System.Diagnostics.UnreachableException || ex is RunnerEventsProcessor.AssertException) // Don't retry this event. 
- { - markEventsProcessedList.Add(token => nodeEventProcessor.MarkMessageProcessedAsync(eventMessage, token)); - } + return; + } - return; + // Helpers + async ValueTask GetTesTaskAsync(string id, string @event) + { + TesTask tesTask = default; + if (await repository.TryGetItemAsync(id, cancellationToken, task => tesTask = task) && tesTask is not null) + { + logger.LogDebug("Completing event '{TaskEvent}' for task {TesTask}.", @event, tesTask.Id); + return tesTask; } - - markEventsProcessedList.Add(token => nodeEventProcessor.MarkMessageProcessedAsync(eventMessage, token)); - - // Helpers - async ValueTask GetTesTaskAsync(string id, string @event) + else { - TesTask tesTask = default; - if (await repository.TryGetItemAsync(id, cancellationToken, task => tesTask = task) && tesTask is not null) - { - logger.LogDebug("Completing event '{TaskEvent}' for task {TesTask}.", @event, tesTask.Id); - return tesTask; - } - else - { - logger.LogDebug("Could not find task {TesTask} for event '{TaskEvent}'.", id, @event); - return null; - } + logger.LogDebug("Could not find task {TesTask} for event '{TaskEvent}'.", id, @event); + return null; } - }); + } + }); - getEventsInOrder = () => nodeEventProcessor.OrderProcessedByExecutorSequence(messages, item => item.Message).Select(item => (item.Task, item.State)); - } + return nodeEventProcessor.OrderProcessedByExecutorSequence(messages, @event => @event.Message).Select(@event => (@event.Task, @event.State, @event.MarkProcessedAsync)); + } - // Ensure the IEnumerable is only enumerated one time. - var orderedMessageList = getEventsInOrder().ToList(); + /// + /// Updates each task based on the provided states. + /// + /// A collection of associated s, s, and a method to mark the source event processed. + /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. 
+ /// + async ValueTask UpdateTesTasksFromAvailableEventsAsync(IEnumerable<(TesTask Task, AzureBatchTaskState State, Func MarkProcessedAsync)> eventStates, CancellationToken cancellationToken) + { + eventStates = eventStates.ToList(); - if (!orderedMessageList.Any()) + if (!eventStates.Any()) { return; } @@ -209,29 +215,23 @@ async ValueTask GetTesTaskAsync(string id, string @event) await OrchestrateTesTasksOnBatchAsync( "NodeEvent", #pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously - async _ => GetTesTasks(), + async _ => eventStates.Select(@event => @event.Task).ToAsyncEnumerable(), #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously - (tesTasks, token) => batchScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, orderedMessageList.Select(t => t.State).ToArray(), token), - stoppingToken, + (tesTasks, token) => batchScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, eventStates.Select(@event => @event.State).ToArray(), token), + cancellationToken, "events"); - await Parallel.ForEachAsync(markEventsProcessedList, stoppingToken, async (markEventProcessed, cancellationToken) => - { - try - { - await markEventProcessed(cancellationToken); - } - catch (Exception ex) - { - logger.LogError(ex, @"Failed to tag event processed."); - } - }); - - // Helpers - IAsyncEnumerable GetTesTasks() + await Parallel.ForEachAsync(eventStates.Select(@event => @event.MarkProcessedAsync).Where(func => func is not null), cancellationToken, async (markEventProcessed, cancellationToken) => { - return orderedMessageList.Select(t => t.Task).ToAsyncEnumerable(); - } + try + { + await markEventProcessed(cancellationToken); + } + catch (Exception ex) + { + logger.LogError(ex, @"Failed to tag event as processed."); + } + }); } } } diff --git a/src/TesApi.Web/appsettings.json b/src/TesApi.Web/appsettings.json index 4f5fc0b80..1d07cff35 100644 --- a/src/TesApi.Web/appsettings.json +++ 
b/src/TesApi.Web/appsettings.json @@ -5,9 +5,9 @@ "Microsoft": "Warning", "TesApi.Web.AzureProxy": "Debug", "TesApi.Web.BatchPool": "Debug", - "TesApi.Web.BatchPoolService": "Debug", "TesApi.Web.BatchScheduler": "Debug", - "TesApi.Web.Scheduler": "Debug", + "TesApi.Web.PoolScheduler": "Debug", + "TesApi.Web.TaskScheduler": "Debug", "Default": "Information" } }, From 654cc18e014ac901613a119d95f346ab5855dc35 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 13 Nov 2023 23:13:26 -0800 Subject: [PATCH 084/202] Address errors, reduce batch api calls --- src/TesApi.Tests/BatchPoolTests.cs | 1 - src/TesApi.Tests/BatchSchedulerTests.cs | 55 ++-- src/TesApi.Web/AzureBatchTaskState.cs | 10 +- src/TesApi.Web/BatchPool.cs | 252 ++++++------------ src/TesApi.Web/BatchPools.BatchScheduler.cs | 4 +- src/TesApi.Web/BatchScheduler.cs | 89 ++++--- .../Events/RunnerEventsProcessor.cs | 2 +- src/TesApi.Web/IBatchPool.cs | 22 +- src/TesApi.Web/IBatchScheduler.cs | 2 +- src/TesApi.Web/PoolScheduler.cs | 217 +++++++++++++-- 10 files changed, 377 insertions(+), 277 deletions(-) diff --git a/src/TesApi.Tests/BatchPoolTests.cs b/src/TesApi.Tests/BatchPoolTests.cs index bd3b91142..e6f722f40 100644 --- a/src/TesApi.Tests/BatchPoolTests.cs +++ b/src/TesApi.Tests/BatchPoolTests.cs @@ -4,7 +4,6 @@ using System; using System.Collections.Generic; using System.Linq; -using System.Threading; using System.Threading.Tasks; using Microsoft.Azure.Batch; using Microsoft.Azure.Management.Batch.Models; diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index 2263b33d9..e8ab9f5fc 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -270,6 +270,7 @@ public async Task TesTaskFailsWithSystemErrorWhenTotalBatchQuotaIsSetTooLow() public async Task TesTaskFailsWhenBatchNodeDiskIsFull() { var tesTask = GetTesTask(); + tesTask.State = TesState.INITIALIZINGEnum; (var failureReason, var systemLog) = await 
ProcessTesTaskAndGetFailureReasonAndSystemLogAsync(tesTask, BatchTaskStates.NodeDiskFull); @@ -786,48 +787,38 @@ static async Task RunTest(string allowedVmSizes, TesState expectedTaskState, str [TestMethod] public async Task TaskStateTransitionsFromRunningState() { - //Assert.AreEqual(TesState.RUNNINGEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchTaskStates.TaskActive)); - Assert.AreEqual(TesState.RUNNINGEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchTaskStates.TaskPreparing)); + Assert.AreEqual(TesState.RUNNINGEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchTaskStates.TaskActive)); + Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchTaskStates.TaskPreparing)); Assert.AreEqual(TesState.RUNNINGEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchTaskStates.TaskRunning)); Assert.AreEqual(TesState.COMPLETEEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchTaskStates.TaskCompletedSuccessfully)); Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchTaskStates.TaskFailed)); - //Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchTaskStates.JobNotFound)); - //Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchTaskStates.TaskNotFound)); - //Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchTaskStates.MoreThanOneJobFound)); - Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchTaskStates.NodeDiskFull)); // TODO: Should be ExecutorError, but this currently falls into the bucket of NodeFailedDuringStartupOrExecution, which also covers StartTask failures, which are more correctly SystemError. 
- //Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchTaskStates.ActiveJobWithMissingAutoPool)); + Assert.AreEqual(TesState.CANCELEDEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchTaskStates.CancellationRequested)); + Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchTaskStates.NodeDiskFull)); + Assert.AreEqual(TesState.RUNNINGEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchTaskStates.UploadOrDownloadFailed)); Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.RUNNINGEnum, BatchTaskStates.NodePreempted)); } [TestMethod] public async Task TaskStateTransitionsFromInitializingState() { - //Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.TaskActive)); + Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.TaskActive)); Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.TaskPreparing)); Assert.AreEqual(TesState.RUNNINGEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.TaskRunning)); Assert.AreEqual(TesState.COMPLETEEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.TaskCompletedSuccessfully)); Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.TaskFailed)); - //Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.JobNotFound)); - //Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.TaskNotFound)); - //Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.MoreThanOneJobFound)); - Assert.AreEqual(TesState.SYSTEMERROREnum, 
await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.NodeDiskFull)); // TODO: Should be ExecutorError, but this currently falls into the bucket of NodeFailedDuringStartupOrExecution, which also covers StartTask failures, which are more correctly SystemError. + Assert.AreEqual(TesState.CANCELEDEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.CancellationRequested)); + Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.NodeDiskFull)); + Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.UploadOrDownloadFailed)); + Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.NodeStartTaskFailed)); Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.NodeAllocationFailed)); - //Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.ImageDownloadFailed)); - //Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.ActiveJobWithMissingAutoPool)); Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.INITIALIZINGEnum, BatchTaskStates.NodePreempted)); } [TestMethod] public async Task TaskStateTransitionsFromQueuedState() { - Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchTaskStates.TaskActive)); - //Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchTaskStates.TaskPreparing)); - Assert.AreEqual(TesState.RUNNINGEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchTaskStates.TaskRunning)); - Assert.AreEqual(TesState.COMPLETEEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchTaskStates.TaskCompletedSuccessfully)); - 
Assert.AreEqual(TesState.EXECUTORERROREnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchTaskStates.TaskFailed)); - //Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchTaskStates.MoreThanOneJobFound)); - Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchTaskStates.NodeDiskFull)); // TODO: Should be ExecutorError, but this currently falls into the bucket of NodeFailedDuringStartupOrExecution, which also covers StartTask failures, which are more correctly SystemError. - //Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchTaskStates.TaskNotFound)); + Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum)); + Assert.AreEqual(TesState.CANCELEDEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchTaskStates.CancellationRequested)); } [TestMethod] @@ -883,10 +874,10 @@ public async Task TaskThatFailsWithNodeAllocationErrorIsRequeuedOnDifferentVmSiz [TestMethod] public async Task TaskGetsCancelled() { - var tesTask = new TesTask { Id = "test", PoolId = "pool1", State = TesState.CANCELINGEnum }; + var tesTask = new TesTask { Id = "test", PoolId = "pool1", State = TesState.CANCELINGEnum, Logs = new() { new() } }; var azureProxyReturnValues = AzureProxyReturnValues.Defaults; - azureProxyReturnValues.BatchTaskState = BatchTaskStates.Terminated; + azureProxyReturnValues.BatchTaskState = BatchTaskStates.CancellationRequested; Mock azureProxy = default; var azureProxySetter = new Action>(mock => { @@ -943,6 +934,7 @@ public async Task TaskGetsCancelled() public async Task SuccessfullyCompletedTaskContainsBatchNodeMetrics() { var tesTask = GetTesTask(); + tesTask.State = TesState.INITIALIZINGEnum; var metricsFileContent = @" BlobXferPullStart=2020-10-08T02:30:39+00:00 @@ -994,6 +986,7 @@ public async Task SuccessfullyCompletedTaskContainsBatchNodeMetrics() public async Task 
SuccessfullyCompletedTaskContainsCromwellResultCode() { var tesTask = GetTesTask(); + tesTask.State = TesState.INITIALIZINGEnum; var azureProxyReturnValues = AzureProxyReturnValues.Defaults; azureProxyReturnValues.BatchTaskState = BatchTaskStates.TaskCompletedSuccessfully; @@ -1844,21 +1837,17 @@ internal static async ValueTask GuardAssertsWithTesTask(TesTask tesTask, Func default; + public static AzureBatchTaskState TaskActive => new(AzureBatchTaskState.TaskState.InfoUpdate); public static AzureBatchTaskState TaskPreparing => new(AzureBatchTaskState.TaskState.Initializing, CloudTaskCreationTime: DateTimeOffset.UtcNow); public static AzureBatchTaskState TaskRunning => new(AzureBatchTaskState.TaskState.Running, CloudTaskCreationTime: DateTimeOffset.UtcNow - TimeSpan.FromMinutes(6)); public static AzureBatchTaskState TaskCompletedSuccessfully => new(AzureBatchTaskState.TaskState.CompletedSuccessfully, BatchTaskExitCode: 0); public static AzureBatchTaskState TaskFailed => new(AzureBatchTaskState.TaskState.CompletedWithErrors, BatchTaskExitCode: -1); - //public static AzureBatchTaskState JobNotFound => new() { JobState = null }; - //public static AzureBatchTaskState TaskNotFound => new() { JobState = JobState.Active, TaskState = null }; - //public static AzureBatchTaskState MoreThanOneJobFound => new() { MoreThanOneActiveJobOrTaskFound = true }; + public static AzureBatchTaskState NodeDiskFull => new(AzureBatchTaskState.TaskState.NodeFailedDuringStartupOrExecution, Failure: new("DiskFull", new[] { "Error message." })); + public static AzureBatchTaskState UploadOrDownloadFailed => new(AzureBatchTaskState.TaskState.NodeFilesUploadOrDownloadFailed); public static AzureBatchTaskState NodeAllocationFailed => new(AzureBatchTaskState.TaskState.NodeAllocationFailed, Failure: new(AzureBatchTaskState.TaskState.NodeAllocationFailed.ToString(), new[] { "Error message." 
})); public static AzureBatchTaskState NodePreempted => new(AzureBatchTaskState.TaskState.NodePreempted); - public static AzureBatchTaskState NodeDiskFull => new(AzureBatchTaskState.TaskState.NodeFailedDuringStartupOrExecution, Failure: new("DiskFull", new[] { "Error message." })); - - public static AzureBatchTaskState Terminated => new(AzureBatchTaskState.TaskState.CancellationRequested, CloudTaskCreationTime: DateTimeOffset.UtcNow - TimeSpan.FromMinutes(12)); - //public static AzureBatchTaskState ActiveJobWithMissingAutoPool => new() { ActiveJobWithMissingAutoPool = true }; - //public static AzureBatchTaskState ImageDownloadFailed => new() { JobState = JobState.Active, NodeErrorCode = "ContainerInvalidImage" }; + public static AzureBatchTaskState NodeStartTaskFailed => new(AzureBatchTaskState.TaskState.NodeStartTaskFailed); + public static AzureBatchTaskState CancellationRequested => new(AzureBatchTaskState.TaskState.CancellationRequested, CloudTaskCreationTime: DateTimeOffset.UtcNow - TimeSpan.FromMinutes(12)); } private class AzureProxyReturnValues diff --git a/src/TesApi.Web/AzureBatchTaskState.cs b/src/TesApi.Web/AzureBatchTaskState.cs index 74d9c916d..da9946a1b 100644 --- a/src/TesApi.Web/AzureBatchTaskState.cs +++ b/src/TesApi.Web/AzureBatchTaskState.cs @@ -44,7 +44,7 @@ public enum TaskState /// /// The event does not represent any change in the task's state. /// - NoChange, + NoChange = 1, /// /// The event provides task state without changing the task's . 
@@ -92,9 +92,9 @@ public enum TaskState NodePreempted, /// - /// node in an Unusable state detected + /// Node failed during the startup task /// - NodeUnusable, + NodeStartTaskFailed, /// /// Node failed during startup or task execution (for example, ContainerInvalidImage, DiskFull) @@ -125,8 +125,8 @@ public record struct FailureInformation(string Reason, IEnumerable Syste /// /// SystemLog-appending copy constructor /// - /// - /// + /// to copy + /// Text to add to the SystemLog in the copy protected AzureBatchTaskState(AzureBatchTaskState original, string appendToSystemLog) : this(original) { diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index e74f487ea..0f3bf46ef 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -54,15 +54,9 @@ public BatchPool(IBatchScheduler batchScheduler, IOptions StartTaskFailures { get; } = new(); - private Queue ResizeErrors { get; } = new(); - private IAsyncEnumerable GetTasksAsync(string select, string filter) => _removedFromService ? AsyncEnumerable.Empty() : _azureProxy.ListTasksAsync(Id, new ODATADetailLevel { SelectClause = select, FilterClause = filter }); - internal IAsyncEnumerable GetTasksAsync(bool includeCompleted) - => GetTasksAsync("id,stateTransitionTime", includeCompleted ? 
default : "state ne 'completed'"); - private async ValueTask RemoveNodesAsync(IList nodesToRemove, CancellationToken cancellationToken) { _logger.LogDebug("Removing {Nodes} nodes from {PoolId}", nodesToRemove.Count, Id); @@ -380,7 +374,7 @@ private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken canc case ScalingMode.RemovingFailedNodes: _scalingMode = ScalingMode.RemovingFailedNodes; _logger.LogInformation(@"Switching pool {PoolId} back to autoscale.", Id); - await _azureProxy.EnableBatchPoolAutoScaleAsync(Id, !IsDedicated, AutoScaleEvaluationInterval, AutoPoolFormula, GetTaskCountAsync, cancellationToken); + await _azureProxy.EnableBatchPoolAutoScaleAsync(Id, !IsDedicated, AutoScaleEvaluationInterval, AutoPoolFormula, _ => ValueTask.FromResult(GetTasks(includeCompleted: false).Count()), cancellationToken); _autoScaleWaitTime = DateTime.UtcNow + (3 * AutoScaleEvaluationInterval) + (PoolScheduler.RunInterval / 2); _scalingMode = _resetAutoScalingRequired ? ScalingMode.WaitingForAutoScale : ScalingMode.SettingAutoScale; _resetAutoScalingRequired = false; @@ -398,22 +392,7 @@ private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken canc _logger.LogInformation(@"Pool {PoolId} is back to normal resize and monitoring status.", Id); break; } - - async ValueTask GetTaskCountAsync(int @default) // Used to make reenabling auto-scale more performant by attempting to gather the current number of "pending" tasks, falling back on the current target. - { - try - { - return await GetTasksAsync(includeCompleted: false).CountAsync(cancellationToken); - } - catch - { - return @default; - } - } } - - IAsyncEnumerable GetNodesToRemove(bool withState) - => _azureProxy.ListComputeNodesAsync(Id, new ODATADetailLevel(filterClause: @"state eq 'starttaskfailed' or state eq 'preempted' or state eq 'unusable'", selectClause: withState ? @"id,state,startTaskInfo" : @"id")); } private bool DetermineIsAvailable(DateTime? 
creation) @@ -436,10 +415,10 @@ private async ValueTask ServicePoolRemovePoolIfEmptyAsync(CancellationToken canc // Get current node counts var (_, _, _, _, lowPriorityNodes, _, dedicatedNodes) = await _azureProxy.GetFullAllocationStateAsync(Id, cancellationToken); - if (lowPriorityNodes.GetValueOrDefault(0) == 0 && dedicatedNodes.GetValueOrDefault(0) == 0 && !await GetTasksAsync(includeCompleted: true).AnyAsync(cancellationToken)) + if (lowPriorityNodes.GetValueOrDefault() == 0 && dedicatedNodes.GetValueOrDefault() == 0 && !GetTasks(includeCompleted: true).Any()) { _ = _batchPools.RemovePoolFromList(this); - await _batchPools.DeletePoolAsync(this, cancellationToken); + await _batchPools.DeletePoolAndJobAsync(this, cancellationToken); } } } @@ -485,6 +464,12 @@ internal enum ServiceKind /// public string Id { get; private set; } + /// + public Queue StartTaskFailures { get; } = new(); + + /// + public Queue ResizeErrors { get; } = new(); + /// public async ValueTask CanBeDeletedAsync(CancellationToken cancellationToken = default) { @@ -493,24 +478,24 @@ public async ValueTask CanBeDeletedAsync(CancellationToken cancellationTok return true; } - if (await GetTasksAsync(includeCompleted: true).AnyAsync(cancellationToken)) + if (await GetTasksAsync("id", default).AnyAsync(cancellationToken)) { return false; } - await foreach (var node in _azureProxy.ListComputeNodesAsync(Id, new ODATADetailLevel(selectClause: "state")).WithCancellation(cancellationToken)) - { - switch (node.State) - { - case ComputeNodeState.Rebooting: - case ComputeNodeState.Reimaging: - case ComputeNodeState.Running: - case ComputeNodeState.Creating: - case ComputeNodeState.Starting: - case ComputeNodeState.WaitingForStartTask: - return false; - } - } + //await foreach (var node in _azureProxy.ListComputeNodesAsync(Id, new ODATADetailLevel(selectClause: "state")).WithCancellation(cancellationToken)) + //{ + // switch (node.State) + // { + // case ComputeNodeState.Rebooting: + // case 
ComputeNodeState.Reimaging: + // case ComputeNodeState.Running: + // case ComputeNodeState.Creating: + // case ComputeNodeState.Starting: + // case ComputeNodeState.WaitingForStartTask: + // return false; + // } + //} return true; } @@ -549,9 +534,32 @@ internal async ValueTask ServicePoolAsync(ServiceKind serviceKind, CancellationT /// public async ValueTask ServicePoolAsync(CancellationToken cancellationToken) { + async ValueTask StandupQueries() + { + // List tasks from batch just one time each time we service the pool when called from PoolScheduler + _foundTasks = await GetTasksAsync("creationTime,executionInfo,id,nodeInfo,state,stateTransitionTime", null).ToListAsync(cancellationToken); + + // List nodes from Batch at most one time each time we service the pool + if (_foundTasks.Where(PoolScheduler.TaskListWithComputeNodeInfoPredicate).Any()) + { + var nodes = (await _azureProxy.ListComputeNodesAsync(Id, + new ODATADetailLevel(filterClause: @"state eq 'starttaskfailed' or state eq 'preempted' or state eq 'unusable'", selectClause: @"errors,id,state,startTaskInfo")) + .ToListAsync(cancellationToken)) + .ToAsyncEnumerable(); + _lazyComputeNodes = _ => new(nodes); + } + else + { + _lazyComputeNodes = withState => new(_azureProxy.ListComputeNodesAsync(Id, + new ODATADetailLevel(filterClause: @"state eq 'starttaskfailed' or state eq 'preempted' or state eq 'unusable'", selectClause: withState.Value ? 
@"id,state,startTaskInfo" : @"id"))); + } + } + var exceptions = new List(); - _ = await PerformTask(ServicePoolAsync(ServiceKind.GetResizeErrors, cancellationToken), cancellationToken) && + // Run each servicing task serially and accumulate the exception, except whenever the pool or the job are not found + _ = await PerformTask(StandupQueries(), cancellationToken) && + await PerformTask(ServicePoolAsync(ServiceKind.GetResizeErrors, cancellationToken), cancellationToken) && await PerformTask(ServicePoolAsync(ServiceKind.ManagePoolScaling, cancellationToken), cancellationToken) && await PerformTask(ServicePoolAsync(ServiceKind.Rotate, cancellationToken), cancellationToken) && await PerformTask(ServicePoolAsync(ServiceKind.RemovePoolIfEmpty, cancellationToken), cancellationToken); @@ -588,24 +596,26 @@ async ValueTask PerformTask(ValueTask serviceAction, CancellationToken can catch (Exception ex) { exceptions.Add(ex); - return await RemoveMissingPoolsAsync(ex, cancellationToken); + return !await RemoveMissingPoolsAsync(ex, cancellationToken); } } return false; } - // Returns false when pool/job was removed because it was not found. Returns true otherwise. + // Returns true when pool/job was removed because it was not found. Returns false otherwise. async ValueTask RemoveMissingPoolsAsync(Exception ex, CancellationToken cancellationToken) { switch (ex) { case AggregateException aggregateException: - var result = true; + var result = false; + foreach (var e in aggregateException.InnerExceptions) { - result &= await RemoveMissingPoolsAsync(e, cancellationToken); + result |= await RemoveMissingPoolsAsync(e, cancellationToken); } + return result; case BatchException batchException: @@ -614,143 +624,37 @@ async ValueTask RemoveMissingPoolsAsync(Exception ex, CancellationToken ca { _logger.LogError(ex, "Batch pool and/or job {PoolId} is missing. 
Removing them from TES's active pool list.", Id); _ = _batchPools.RemovePoolFromList(this); - await _batchPools.DeletePoolAsync(this, cancellationToken); - return false; + await _batchPools.DeletePoolAndJobAsync(this, cancellationToken); + return true; } + break; } - return true; + + return false; } } + private IEnumerable _foundTasks = Enumerable.Empty(); + + private Func>> _lazyComputeNodes; + + private IAsyncEnumerable GetNodesToRemove(bool withState) + => _lazyComputeNodes(withState).Value; + /// - public IAsyncEnumerable GetTasksToDelete(DateTime now, CancellationToken cancellationToken) + public IAsyncEnumerable ListLostComputeNodesAsync() { - now = now.ToUniversalTime(); - return GetTasksAsync("creationTime,id", $"state eq 'completed' and creationTime lt datetime'{now - TimeSpan.FromMinutes(10):O}' and stateTransitionTime gt datetime'{now + TimeSpan.FromMinutes(3)}'").Select(task => new IBatchScheduler.CloudTaskId(Id, task.Id, task.CreationTime.Value)); + return _lazyComputeNodes(null).Value.Where(node => !ComputeNodeState.StartTaskFailed.Equals(node.State)); } + private IEnumerable GetTasks(bool includeCompleted) + => _foundTasks.Where(task => includeCompleted || !TaskState.Completed.Equals(task.State)); + /// - public async IAsyncEnumerable GetCloudTaskStatesAsync(DateTime now, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) + public IAsyncEnumerable ListCloudTasksAsync() { - now = now.ToUniversalTime(); - List taskListWithComputeNodeInfo; // To check if the task was running when its node became preempted or unusable - List activeTaskList; // These are candidates to be the victim of resizes or starttask failures - List completedTaskList; // Backstop if events don't provide timely task completion information in a timely manner - - { - var taskList = await GetTasksAsync("executionInfo,id,nodeInfo,state,stateTransitionTime", null).ToListAsync(cancellationToken); - taskListWithComputeNodeInfo = taskList.Where(task 
=> !TaskState.Completed.Equals(task.State) && !string.IsNullOrEmpty(task.ComputeNodeInformation?.ComputeNodeId)).ToList(); - activeTaskList = taskList.Where(task => TaskState.Active.Equals(task.State)).OrderByDescending(task => task.StateTransitionTime).ToList(); - completedTaskList = taskList.Where(task => TaskState.Completed.Equals(task.State) && task.StateTransitionTime < now - TimeSpan.FromMinutes(2)).ToList(); - } - - await foreach (var node in _azureProxy.ListComputeNodesAsync(Id, new ODATADetailLevel(filterClause: @"state eq 'preempted' or state eq 'unusable'", selectClause: @"errors,id,state")).WithCancellation(cancellationToken)) - { - foreach (var task in taskListWithComputeNodeInfo.Where(task => node.Id.Equals(task.ComputeNodeInformation.ComputeNodeId, StringComparison.InvariantCultureIgnoreCase))) - { - yield return new(task.Id, node.State switch - { - ComputeNodeState.Preempted => new(AzureBatchTaskState.TaskState.NodePreempted), - ComputeNodeState.Unusable => new(AzureBatchTaskState.TaskState.NodeUnusable, Failure: ParseComputeNodeErrors(node.Errors)), - _ => throw new System.Diagnostics.UnreachableException(), - }); - - _ = activeTaskList.Remove(task); - } - } - - await foreach (var state in activeTaskList.ToAsyncEnumerable().Zip(GetFailures(cancellationToken), - (cloud, state) => new CloudTaskBatchTaskState(cloud.Id, state)) - .WithCancellation(cancellationToken)) - { - yield return state; - } - - foreach (var task in completedTaskList) - { - yield return new(task.Id, GetCompletedBatchState(task)); - } - - yield break; - - static AzureBatchTaskState.FailureInformation ParseComputeNodeErrors(IReadOnlyList nodeErrors) - { - var totalList = nodeErrors.Select(nodeError => Enumerable.Empty().Append(nodeError.Code).Append(nodeError.Message) - .Concat(nodeError.ErrorDetails.Select(errorDetail => Enumerable.Empty().Append(errorDetail.Name).Append(errorDetail.Value)).SelectMany(s => s))) - .SelectMany(s => s).ToList(); - - if 
(totalList.Contains(TaskFailureInformationCodes.DiskFull)) - { - return new(TaskFailureInformationCodes.DiskFull, totalList); - } - else - { - return new(BatchErrorCodeStrings.NodeStateUnusable, totalList); - } - } - -#pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously - async IAsyncEnumerable GetFailures([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) -#pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously - { - for (var failure = PopNextStartTaskFailure(); failure is not null; failure = PopNextStartTaskFailure()) - { - yield return ConvertFromStartTask(failure); - cancellationToken.ThrowIfCancellationRequested(); - } - - for (var failure = PopNextResizeError(); failure is not null; failure = PopNextResizeError()) - { - yield return ConvertFromResize(failure); - cancellationToken.ThrowIfCancellationRequested(); - } - - yield break; - } - - AzureBatchTaskState ConvertFromResize(ResizeError failure) - => new(AzureBatchTaskState.TaskState.NodeAllocationFailed, Failure: new(failure.Code, Enumerable.Empty() - .Append(failure.Message) - .Concat(failure.Values.Select(t => t.Value)))); - - AzureBatchTaskState ConvertFromStartTask(TaskFailureInformation failure) - => new(AzureBatchTaskState.TaskState.NodeFailedDuringStartupOrExecution, Failure: new(failure.Code, Enumerable.Empty() - .Append(failure.Message) - .Append($"Start task failed ({failure.Category})") - .Concat(failure.Details.Select(t => t.Value)))); - - ResizeError PopNextResizeError() - => ResizeErrors.TryDequeue(out var resizeError) ? resizeError : default; - - TaskFailureInformation PopNextStartTaskFailure() - => StartTaskFailures.TryDequeue(out var failure) ? 
failure : default; - - AzureBatchTaskState GetCompletedBatchState(CloudTask task) - { - _logger.LogDebug("Getting batch task state from completed task {TesTask}.", _batchPools.GetTesTaskIdFromCloudTaskId(task.Id)); - return task.ExecutionInformation.Result switch - { - TaskExecutionResult.Success => new( - AzureBatchTaskState.TaskState.CompletedSuccessfully, - BatchTaskStartTime: task.ExecutionInformation.StartTime, - BatchTaskEndTime: task.ExecutionInformation.EndTime, - BatchTaskExitCode: task.ExecutionInformation.ExitCode), - - TaskExecutionResult.Failure => new( - AzureBatchTaskState.TaskState.CompletedWithErrors, - Failure: new(task.ExecutionInformation.FailureInformation.Code, - Enumerable.Empty() - .Append(task.ExecutionInformation.FailureInformation.Message) - .Append($"Batch task ExitCode: {task.ExecutionInformation?.ExitCode}, Failure message: {task.ExecutionInformation?.FailureInformation?.Message}") - .Concat(task.ExecutionInformation.FailureInformation.Details.Select(pair => pair.Value))), - BatchTaskStartTime: task.ExecutionInformation.StartTime, - BatchTaskEndTime: task.ExecutionInformation.EndTime, - BatchTaskExitCode: task.ExecutionInformation.ExitCode), - - _ => throw new System.Diagnostics.UnreachableException(), - }; - } + return _foundTasks.ToAsyncEnumerable(); } /// @@ -773,7 +677,7 @@ await Task.WhenAll( { var exception = ex.Flatten(); // If there is only one contained exception, we don't need an AggregateException, and we have a simple path to success (following this if block) - // In the extremely unlikely event that there are no innerexceptions, we don't want to change the existing code flow nor do we want to complicate the (less than 2) path. + // In the extremely unlikely event that there are no innerexceptions, we don't want to change the existing code flow nor do we want to complicate the (less than 2 inner exceptions) path. 
if (exception.InnerExceptions?.Count != 1) { throw new AggregateException(exception.Message, exception.InnerExceptions?.Select(HandleException) ?? Enumerable.Empty()); @@ -788,13 +692,13 @@ await Task.WhenAll( Exception HandleException(Exception ex) { - // When the batch management API creating the pool times out, it may or may not have created the pool. Add an inactive record to delete it if it did get created and try again later. That record will be removed later whether or not the pool was created. + // When the batch management API creating the pool times out, it may or may not have created the pool. Add an inactive record to delete it if it did get created and try again later. That record will be removed when servicing whether or not the pool was really created. Id ??= poolModel.Name; _ = _batchPools.AddPool(this); return ex switch { - OperationCanceledException => ex.InnerException is null ? ex : new AzureBatchPoolCreationException(ex.Message, true, ex), - var x when x is RequestFailedException rfe && rfe.Status == 0 && rfe.InnerException is System.Net.WebException webException && webException.Status == System.Net.WebExceptionStatus.Timeout => new AzureBatchPoolCreationException(ex.Message, true, ex), + OperationCanceledException => ex.InnerException is null ? ex : new AzureBatchPoolCreationException(ex.Message, isTimeout: true, ex), + var x when x is RequestFailedException rfe && rfe.Status == 0 && rfe.InnerException is System.Net.WebException webException && webException.Status == System.Net.WebExceptionStatus.Timeout => new AzureBatchPoolCreationException(ex.Message, isTimeout: true, ex), var x when IsInnermostExceptionSocketException(x) => new AzureBatchPoolCreationException(ex.Message, ex), _ => new AzureBatchPoolCreationException(ex.Message, ex), }; @@ -855,12 +759,10 @@ public void MarkRemovedFromService() } /// - /// Used for unit/module testing. + /// Used only for unit/module testing. 
/// public sealed partial class BatchPool { - internal int TestPendingReservationsCount => GetTasksAsync(includeCompleted: false).CountAsync().AsTask().Result; - internal int? TestTargetDedicated => _azureProxy.GetFullAllocationStateAsync(Id, CancellationToken.None).Result.TargetDedicated; internal int? TestTargetLowPriority => _azureProxy.GetFullAllocationStateAsync(Id, CancellationToken.None).Result.TargetLowPriority; diff --git a/src/TesApi.Web/BatchPools.BatchScheduler.cs b/src/TesApi.Web/BatchPools.BatchScheduler.cs index 8f09c05ec..31e7be03d 100644 --- a/src/TesApi.Web/BatchPools.BatchScheduler.cs +++ b/src/TesApi.Web/BatchPools.BatchScheduler.cs @@ -203,7 +203,7 @@ public async ValueTask FlushPoolsAsync(IEnumerable assignedPools, Cancel .Take(neededPools.Count) .WithCancellation(cancellationToken)) { - await DeletePoolAsync(pool, cancellationToken); + await DeletePoolAndJobAsync(pool, cancellationToken); _ = RemovePoolFromList(pool); } } @@ -214,7 +214,7 @@ public async ValueTask FlushPoolsAsync(IEnumerable assignedPools, Cancel } /// - public Task DeletePoolAsync(IBatchPool pool, CancellationToken cancellationToken) + public Task DeletePoolAndJobAsync(IBatchPool pool, CancellationToken cancellationToken) { // TODO: Consider moving any remaining tasks to another pool, or failing tasks explicitly logger.LogDebug(@"Deleting pool and job {PoolId}", pool.Id); diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 37c4ca0c9..0c2d1bdb3 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -172,31 +172,31 @@ public BatchScheduler( logger.LogInformation(@"usePreemptibleVmsOnly: {UsePreemptibleVmsOnly}", usePreemptibleVmsOnly); - static bool tesTaskIsQueuedInitializingOrRunning(TesTask tesTask) => tesTask.State == TesState.QUEUEDEnum || tesTask.State == TesState.INITIALIZINGEnum || tesTask.State == TesState.RUNNINGEnum; static bool tesTaskIsInitializingOrRunning(TesTask tesTask) => tesTask.State == 
TesState.INITIALIZINGEnum || tesTask.State == TesState.RUNNINGEnum; - static bool tesTaskIsQueuedOrInitializing(TesTask tesTask) => tesTask.State == TesState.QUEUEDEnum || tesTask.State == TesState.INITIALIZINGEnum; - static bool tesTaskIsQueued(TesTask tesTask) => tesTask.State == TesState.QUEUEDEnum; + static bool tesTaskIsInitializing(TesTask tesTask) => tesTask.State == TesState.INITIALIZINGEnum; var setTaskStateLock = new object(); async Task SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { { - var newData = System.Text.Json.JsonSerializer.Serialize( - batchInfo, + var newData = new CombinedBatchTaskInfo(batchInfo, false); + if (newData.Failure is null) { newData.AlternateSystemLogItem = null; } + var newDataText = System.Text.Json.JsonSerializer.Serialize( + newData, new System.Text.Json.JsonSerializerOptions() { DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingDefault, Converters = { new System.Text.Json.Serialization.JsonStringEnumConverter(System.Text.Json.JsonNamingPolicy.CamelCase) } }); - if ("{}".Equals(newData) && newTaskState == tesTask.State) + if ("{}".Equals(newDataText) && newTaskState == tesTask.State) { logger.LogDebug(@"For task {TesTask} there's nothing to change.", tesTask.Id); return false; } - logger.LogDebug(@"Setting task {TesTask} with metadata {Metadata}.", tesTask.Id, newData); + logger.LogDebug(@"Setting task {TesTask} with metadata {Metadata}.", tesTask.Id, newDataText); } var (batchNodeMetrics, taskStartTime, taskEndTime, cromwellRcCode) = newTaskState == TesState.COMPLETEEnum @@ -317,14 +317,14 @@ Task AddSystemLogAndSetTaskSystemErrorAsync(TesTask tesTask, CombinedBatch bool HandlePreemptedNode(TesTask tesTask, CombinedBatchTaskInfo batchInfo) { - // TODO: Keep track of the number of times Azure Batch retried this task and fail it as preempted if it is too many times. 
Waiting on Cromwell to support preempted tasks to do this. + // TODO: Keep track of the number of times Azure Batch retried this task and terminate it as preempted if it is too many times. Are we waiting on Cromwell to support preempted tasks to do this? var oldLog = tesTask.GetOrAddTesTaskLog(); var newLog = tesTask.AddTesTaskLog(); - oldLog.Warning = "ComputeNode was preempted. The task will be automatically rescheduled."; + oldLog.Warning = "ComputeNode was preempted. The task was automatically rescheduled."; newLog.VirtualMachineInfo = oldLog.VirtualMachineInfo; newLog.StartTime = DateTimeOffset.UtcNow; tesTask.State = TesState.INITIALIZINGEnum; - logger.LogInformation("The TesTask {TesTask}'s node was preempted. It will be automatically rescheduled.", tesTask.Id); + logger.LogInformation("The TesTask {TesTask}'s node was preempted. It was automatically rescheduled.", tesTask.Id); return true; } @@ -336,19 +336,15 @@ Task HandleInfoUpdate(TesTask tesTask, CombinedBatchTaskInfo batchInfo, Ca tesTaskStateTransitions = new List() { new TesTaskStateTransition(condition: null, AzureBatchTaskState.TaskState.CancellationRequested, alternateSystemLogItem: null, TerminateBatchTaskAsync), - new TesTaskStateTransition(tesTaskIsQueued, AzureBatchTaskState.TaskState.Initializing, alternateSystemLogItem: null, (tesTask, _) => { tesTask.State = TesState.INITIALIZINGEnum; return true; }), - new TesTaskStateTransition(tesTaskIsQueuedOrInitializing, AzureBatchTaskState.TaskState.NodeAllocationFailed, alternateSystemLogItem: null, RequeueTaskAfterFailureAsync), - new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.Running, alternateSystemLogItem: null, (tesTask, info, ct) => SetTaskStateAndLog(tesTask, TesState.RUNNINGEnum, info, ct)), - //new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.MoreThanOneActiveJobOrTaskFound, BatchTaskState.MoreThanOneActiveJobOrTaskFound.ToString(), 
DeleteBatchJobAndSetTaskSystemErrorAsync), - new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.CompletedSuccessfully, alternateSystemLogItem: null, SetTaskCompleted), - new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.CompletedWithErrors, "Please open an issue. There should have been an error reported here.", SetTaskExecutorError), - //new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.ActiveJobWithMissingAutoPool, alternateSystemLogItem: null, DeleteBatchJobAndRequeueTaskAsync), - new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.NodeFailedDuringStartupOrExecution, "Please open an issue. There should have been an error reported here.", SetTaskSystemError), - new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.NodeUnusable, "Please open an issue. There should have been an error reported here.", SetTaskExecutorError), - //new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.JobNotFound, BatchTaskState.JobNotFound.ToString(), SetTaskSystemError), - //new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.MissingBatchTask, BatchTaskState.MissingBatchTask.ToString(), DeleteBatchJobAndSetTaskSystemErrorAsync), + new TesTaskStateTransition(tesTaskIsInitializing, AzureBatchTaskState.TaskState.NodeAllocationFailed, alternateSystemLogItem: null, RequeueTaskAfterFailureAsync), + new TesTaskStateTransition(tesTaskIsInitializing, AzureBatchTaskState.TaskState.NodeStartTaskFailed, "Please open an issue. 
There should have been an error reported here.", SetTaskSystemError), + new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.Initializing, alternateSystemLogItem: null, (tesTask, info, ct) => SetTaskStateAndLog(tesTask, TesState.INITIALIZINGEnum, info, ct)), + new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.Running, alternateSystemLogItem: null, (tesTask, info, ct) => SetTaskStateAndLog(tesTask, TesState.RUNNINGEnum, info, ct)), + new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.CompletedSuccessfully, alternateSystemLogItem: null, SetTaskCompleted), + new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.CompletedWithErrors, "Please open an issue. There should have been an error reported here.", SetTaskExecutorError), + new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.NodeFailedDuringStartupOrExecution, "Please open an issue. 
There should have been an error reported here.", SetTaskSystemError), new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.NodePreempted, alternateSystemLogItem: null, HandlePreemptedNode), - new TesTaskStateTransition(tesTaskIsQueuedInitializingOrRunning, AzureBatchTaskState.TaskState.NodeFilesUploadOrDownloadFailed, alternateSystemLogItem: null, HandleInfoUpdate), + new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.NodeFilesUploadOrDownloadFailed, alternateSystemLogItem: null, HandleInfoUpdate), new TesTaskStateTransition(condition: null, AzureBatchTaskState.TaskState.InfoUpdate, alternateSystemLogItem: null, HandleInfoUpdate), }.AsReadOnly(); } @@ -383,7 +379,7 @@ private async Task TerminateBatchTaskAsync(TesTask tesTask, CombinedBatchT { case AzureBatchTaskState.TaskState.CompletedSuccessfully: case AzureBatchTaskState.TaskState.CompletedWithErrors: - return false; // Let it finish on its own + return false; // It's already finished case AzureBatchTaskState.TaskState.CancellationRequested: @@ -393,14 +389,18 @@ private async Task TerminateBatchTaskAsync(TesTask tesTask, CombinedBatchT } tesTask.State = TesState.CANCELEDEnum; + + if (!(tesTask.Logs?.Any() ?? 
false)) + { + return true; // It was never scheduled + } + goto default; default: - break; + await azureProxy.TerminateBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); + break;//return true; } - - await azureProxy.TerminateBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); - //return true; } catch (BatchException exc) when (BatchErrorCodeStrings.TaskNotFound.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { @@ -412,7 +412,7 @@ private async Task TerminateBatchTaskAsync(TesTask tesTask, CombinedBatchT throw; } - try + try // TODO: remove (and undo changes to taskExecutionScriptingManager) { await taskExecutionScriptingManager.TryUploadServerTesTask(tesTask, "server-tes-task-completed.json", cancellationToken); } @@ -1410,13 +1410,13 @@ static bool TryGetValueAsDouble(Dictionary dict, string key, out } catch (Exception ex) { - logger.LogError($"Failed to parse metrics for task {tesTask.Id}. Error: {ex.Message}"); + logger.LogError(@"Failed to parse metrics for task {TesTask}. Error: {ExceptionMessage}", tesTask.Id, ex.Message); } } } catch (Exception ex) { - logger.LogError($"Failed to get batch node metrics for task {tesTask.Id}. Error: {ex.Message}"); + logger.LogError(@"Failed to get batch node metrics for task {TesTask}. 
Error: {ExceptionMessage}", tesTask.Id, ex.Message); } return (batchNodeMetrics, taskStartTime, taskEndTime, cromwellRcCode); @@ -1524,19 +1524,40 @@ public async ValueTask ActionAsync(TesTask tesTask, AzureBatchTaskState ba private record CombinedBatchTaskInfo : AzureBatchTaskState { - public CombinedBatchTaskInfo(CombinedBatchTaskInfo state, string additionalSystemLogItem) - : base(state, additionalSystemLogItem) + /// + /// Copy constructor that defaults (to enable hiding if serialized) + /// + /// to copy + /// Parameter that exists to not override the default copy constructor + public CombinedBatchTaskInfo(CombinedBatchTaskInfo original, bool _1) + : this(original) + { + State = default; + } + + /// + /// SystemLog-appending copy constructor + /// + /// to copy + /// Text to add to the SystemLog in the copy + public CombinedBatchTaskInfo(CombinedBatchTaskInfo original, string additionalSystemLogItem) + : base(original, additionalSystemLogItem) { - AlternateSystemLogItem = state.AlternateSystemLogItem; + AlternateSystemLogItem = original.AlternateSystemLogItem; // reattach this property } + /// + /// Constructor + /// + /// to extend + /// from the selected Action public CombinedBatchTaskInfo(AzureBatchTaskState state, string alternateSystemLogItem) : base(state) { AlternateSystemLogItem = alternateSystemLogItem; } - public string AlternateSystemLogItem { get; } + public string AlternateSystemLogItem { get; set; } } } } diff --git a/src/TesApi.Web/Events/RunnerEventsProcessor.cs b/src/TesApi.Web/Events/RunnerEventsProcessor.cs index 251320f81..58b3a1da0 100644 --- a/src/TesApi.Web/Events/RunnerEventsProcessor.cs +++ b/src/TesApi.Web/Events/RunnerEventsProcessor.cs @@ -238,7 +238,7 @@ public async Task GetMessageBatchStateAsync(RunnerEventsMes var state = (nodeMessage.Name ?? 
message.Event) switch { - Tes.Runner.Events.EventsPublisher.DownloadStartEvent => new AzureBatchTaskState(AzureBatchTaskState.TaskState.InfoUpdate, + Tes.Runner.Events.EventsPublisher.DownloadStartEvent => new AzureBatchTaskState(AzureBatchTaskState.TaskState.Initializing, BatchTaskStartTime: nodeMessage.Created), Tes.Runner.Events.EventsPublisher.DownloadEndEvent => nodeMessage.StatusMessage switch diff --git a/src/TesApi.Web/IBatchPool.cs b/src/TesApi.Web/IBatchPool.cs index 864ba17a3..5d996b1c3 100644 --- a/src/TesApi.Web/IBatchPool.cs +++ b/src/TesApi.Web/IBatchPool.cs @@ -24,6 +24,16 @@ public interface IBatchPool /// string Id { get; } + /// + /// Failures from nodes in . + /// + Queue StartTaskFailures { get; } + + /// + /// Pool allocation failures that impact task execution ability to be successful. + /// + Queue ResizeErrors { get; } + /// /// Creates an Azure Batch pool and associated job in the Batch Account. /// @@ -59,20 +69,16 @@ public interface IBatchPool ValueTask ServicePoolAsync(CancellationToken cancellationToken = default); /// - /// Gets compute node related task state information. + /// Lists s running in pool's job. /// - /// Reference time. /// - /// - IAsyncEnumerable GetCloudTaskStatesAsync(DateTime now, CancellationToken cancellationToken = default); + IAsyncEnumerable ListCloudTasksAsync(); /// - /// Gets batch tasks that need to be deleted. + /// Lists s that are or . /// - /// Reference time. /// - /// - IAsyncEnumerable GetTasksToDelete(DateTime now, CancellationToken cancellationToken = default); + IAsyncEnumerable ListLostComputeNodesAsync(); /// /// Gets the last time the pool's compute node list was changed. diff --git a/src/TesApi.Web/IBatchScheduler.cs b/src/TesApi.Web/IBatchScheduler.cs index a5ef55a08..3dd12ef10 100644 --- a/src/TesApi.Web/IBatchScheduler.cs +++ b/src/TesApi.Web/IBatchScheduler.cs @@ -72,7 +72,7 @@ public interface IBatchScheduler /// /// A for controlling the lifetime of the asynchronous operation. 
/// - Task DeletePoolAsync(IBatchPool pool, CancellationToken cancellationToken); + Task DeletePoolAndJobAsync(IBatchPool pool, CancellationToken cancellationToken); /// /// Removes pool from list of managed pools. diff --git a/src/TesApi.Web/PoolScheduler.cs b/src/TesApi.Web/PoolScheduler.cs index af748b474..a77d0632e 100644 --- a/src/TesApi.Web/PoolScheduler.cs +++ b/src/TesApi.Web/PoolScheduler.cs @@ -2,10 +2,13 @@ // Licensed under the MIT License. using System; +using System.Collections.Concurrent; using System.Collections.Generic; using System.Linq; using System.Threading; using System.Threading.Tasks; +using Microsoft.Azure.Batch.Common; +using Microsoft.Azure.Batch; using Microsoft.Extensions.Logging; using Tes.Models; using Tes.Repository; @@ -90,22 +93,70 @@ await Parallel.ForEachAsync(pools, stoppingToken, async (pool, token) => /// private Task ServiceBatchPoolsAsync(CancellationToken stoppingToken) { - return ExecuteActionOnIntervalAsync(RunInterval, async cancellationToken => - { + return ExecuteActionOnIntervalAsync( + RunInterval, + async token => await ExecuteActionOnPoolsAsync( "Service Batch Pools", - async (pool, token) => + async (pool, cancellationToken) => { - var now = DateTime.UtcNow; - await pool.ServicePoolAsync(token); - await ProcessCloudTaskStatesAsync(pool.Id, pool.GetCloudTaskStatesAsync(now, token), token); - await ProcessDeletedTasks(pool.GetTasksToDelete(now, token), token); + await pool.ServicePoolAsync(cancellationToken); + await ProcessTasksAsync(pool, DateTime.UtcNow, pool.ListCloudTasksAsync(), cancellationToken); }, - cancellationToken); - }, - stoppingToken); + token), + stoppingToken); + } + + /// + /// Processes tasks to manage state. + /// + /// + /// Reference time. 
+ /// + /// + /// + private async ValueTask ProcessTasksAsync(IBatchPool pool, DateTime now, IAsyncEnumerable tasks, CancellationToken cancellationToken) + { + var batchStateCandidateTasks = AsyncEnumerable.Empty(); + var deletionCandidateTasks = AsyncEnumerable.Empty(); + + var deletionCandidateCreationCutoff = now - TimeSpan.FromMinutes(10); + var stateTransitionTimeCutoff = now + TimeSpan.FromMinutes(3); + + await foreach (var task in tasks.WithCancellation(cancellationToken)) + { + + if (TaskListWithComputeNodeInfoPredicate(task) || ActiveTaskListPredicate(task) || CompletedTaskListPredicate(task, now)) + { + batchStateCandidateTasks = batchStateCandidateTasks.Append(task); + } + + if (TaskState.Completed.Equals(task.State) && task.CreationTime < deletionCandidateCreationCutoff && task.StateTransitionTime > stateTransitionTimeCutoff) + { + deletionCandidateTasks = deletionCandidateTasks.Append(new IBatchScheduler.CloudTaskId(pool.Id, task.Id, task.CreationTime.Value)); + } + } + + await ProcessCloudTaskStatesAsync(pool.Id, GetCloudTaskStatesAsync(pool, now, batchStateCandidateTasks, cancellationToken), cancellationToken); + + await ProcessDeletedTasks(deletionCandidateTasks, cancellationToken); } + /// + /// Shared between and . + /// + internal static bool TaskListWithComputeNodeInfoPredicate(CloudTask task) => !TaskState.Completed.Equals(task.State) && !string.IsNullOrEmpty(task.ComputeNodeInformation?.ComputeNodeId); + + /// + /// Shared between and . + /// + private static bool ActiveTaskListPredicate(CloudTask task) => !TaskState.Active.Equals(task.State); + + /// + /// Shared between and . + /// + private static bool CompletedTaskListPredicate(CloudTask task, DateTime now) => !TaskState.Completed.Equals(task.State) && task.StateTransitionTime < now - TimeSpan.FromMinutes(2); + /// /// Updates each task based on the provided states. 
/// @@ -115,22 +166,22 @@ await ExecuteActionOnPoolsAsync( /// private async ValueTask ProcessCloudTaskStatesAsync(string poolId, IAsyncEnumerable states, CancellationToken cancellationToken) { - var list = new List<(TesTask TesTask, AzureBatchTaskState State)>(); + var list = new ConcurrentBag<(TesTask TesTask, AzureBatchTaskState State)>(); - await foreach (var (cloudTaskId, state) in states.WithCancellation(cancellationToken)) + await Parallel.ForEachAsync(states, cancellationToken, async (state, token) => { TesTask tesTask = default; - if (await repository.TryGetItemAsync(batchScheduler.GetTesTaskIdFromCloudTaskId(cloudTaskId), cancellationToken, task => tesTask = task) && tesTask is not null) + if (await repository.TryGetItemAsync(batchScheduler.GetTesTaskIdFromCloudTaskId(state.CloudTaskId), token, task => tesTask = task) && tesTask is not null) { - list.Add((tesTask, state)); + list.Add((tesTask, state.TaskState)); } else { - logger.LogDebug(@"Unable to locate TesTask for CloudTask '{CloudTask}' with action state {ActionState}.", cloudTaskId, state.State); + logger.LogError(@"Unable to locate TesTask for CloudTask '{CloudTask}' with action state {ActionState}.", state.CloudTaskId, state.TaskState.State); } - } + }); - if (list.Count != 0) + if (!list.IsEmpty) { await OrchestrateTesTasksOnBatchAsync( $"NodeState ({poolId})", @@ -175,5 +226,137 @@ private async ValueTask ProcessDeletedTasks(IAsyncEnumerable + /// Obtains for tasks in specific conditions. 
+ /// + /// + /// + /// + /// + /// + private async IAsyncEnumerable GetCloudTaskStatesAsync(IBatchPool pool, DateTime now, IAsyncEnumerable tasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) + { + List taskListWithComputeNodeInfo; // To check if the task was running when its node became preempted or unusable + List activeTaskList; // These are candidates to be the victim of resizes or starttask failures + List completedTaskList; // Backstop if events don't provide timely task completion information in a timely manner + + { + var taskList = await tasks.ToListAsync(cancellationToken); + taskListWithComputeNodeInfo = taskList.Where(TaskListWithComputeNodeInfoPredicate).ToList(); + activeTaskList = taskList.Where(ActiveTaskListPredicate).OrderByDescending(task => task.StateTransitionTime).ToList(); + completedTaskList = taskList.Where(task => CompletedTaskListPredicate(task, now)).ToList(); + } + + if (taskListWithComputeNodeInfo.Count > 0) + { + await foreach (var node in pool.ListLostComputeNodesAsync().WithCancellation(cancellationToken)) + { + foreach (var task in taskListWithComputeNodeInfo.Where(task => node.Id.Equals(task.ComputeNodeInformation.ComputeNodeId, StringComparison.InvariantCultureIgnoreCase))) + { + yield return new(task.Id, node.State switch + { + ComputeNodeState.Preempted => new(AzureBatchTaskState.TaskState.NodePreempted), + ComputeNodeState.Unusable => new(AzureBatchTaskState.TaskState.NodeFailedDuringStartupOrExecution, Failure: ParseComputeNodeErrors(node.Errors)), + _ => throw new System.Diagnostics.UnreachableException(), + }); + + _ = activeTaskList.Remove(task); + } + } + } + + await foreach (var state in activeTaskList.ToAsyncEnumerable().Zip(GetFailures(cancellationToken), + (cloud, state) => new CloudTaskBatchTaskState(cloud.Id, state)) + .WithCancellation(cancellationToken)) + { + yield return state; + } + + foreach (var task in completedTaskList) + { + yield return new(task.Id, 
GetCompletedBatchState(task)); + } + + yield break; + + static AzureBatchTaskState.FailureInformation ParseComputeNodeErrors(IReadOnlyList nodeErrors) + { + var totalList = nodeErrors.Select(nodeError => Enumerable.Empty().Append(nodeError.Code).Append(nodeError.Message) + .Concat(nodeError.ErrorDetails.Select(errorDetail => Enumerable.Empty().Append(errorDetail.Name).Append(errorDetail.Value)).SelectMany(s => s))) + .SelectMany(s => s).ToList(); + + if (totalList.Contains(TaskFailureInformationCodes.DiskFull)) + { + return new(TaskFailureInformationCodes.DiskFull, totalList); + } + else + { + return new(BatchErrorCodeStrings.NodeStateUnusable, totalList); + } + } + +#pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously + async IAsyncEnumerable GetFailures([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) +#pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously + { + for (var failure = PopNextStartTaskFailure(); failure is not null; failure = PopNextStartTaskFailure()) + { + yield return ConvertFromStartTask(failure); + cancellationToken.ThrowIfCancellationRequested(); + } + + for (var failure = PopNextResizeError(); failure is not null; failure = PopNextResizeError()) + { + yield return ConvertFromResize(failure); + cancellationToken.ThrowIfCancellationRequested(); + } + + yield break; + } + + AzureBatchTaskState ConvertFromResize(ResizeError failure) + => new(AzureBatchTaskState.TaskState.NodeAllocationFailed, Failure: new(failure.Code, Enumerable.Empty() + .Append(failure.Message) + .Concat(failure.Values.Select(t => t.Value)))); + + AzureBatchTaskState ConvertFromStartTask(TaskFailureInformation failure) + => new(AzureBatchTaskState.TaskState.NodeStartTaskFailed, Failure: new(failure.Code, Enumerable.Empty() + .Append(failure.Message) + .Append($"Start task failed ({failure.Category})") + .Concat(failure.Details.Select(t => 
t.Value)))); + + ResizeError PopNextResizeError() + => pool.ResizeErrors.TryDequeue(out var resizeError) ? resizeError : default; + + TaskFailureInformation PopNextStartTaskFailure() + => pool.StartTaskFailures.TryDequeue(out var failure) ? failure : default; + + AzureBatchTaskState GetCompletedBatchState(CloudTask task) + { + logger.LogDebug("Getting batch task state from completed task {TesTask}.", batchScheduler.GetTesTaskIdFromCloudTaskId(task.Id)); + return task.ExecutionInformation.Result switch + { + TaskExecutionResult.Success => new( + AzureBatchTaskState.TaskState.CompletedSuccessfully, + BatchTaskStartTime: task.ExecutionInformation.StartTime, + BatchTaskEndTime: task.ExecutionInformation.EndTime, + BatchTaskExitCode: task.ExecutionInformation.ExitCode), + + TaskExecutionResult.Failure => new( + AzureBatchTaskState.TaskState.CompletedWithErrors, + Failure: new(task.ExecutionInformation.FailureInformation.Code, + Enumerable.Empty() + .Append(task.ExecutionInformation.FailureInformation.Message) + .Append($"Batch task ExitCode: {task.ExecutionInformation?.ExitCode}, Failure message: {task.ExecutionInformation?.FailureInformation?.Message}") + .Concat(task.ExecutionInformation.FailureInformation.Details.Select(pair => pair.Value))), + BatchTaskStartTime: task.ExecutionInformation.StartTime, + BatchTaskEndTime: task.ExecutionInformation.EndTime, + BatchTaskExitCode: task.ExecutionInformation.ExitCode), + + _ => throw new System.Diagnostics.UnreachableException(), + }; + } + } } } From 8293f37a91951c4e64f764176baa2e363b234ff2 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Tue, 14 Nov 2023 01:17:01 -0800 Subject: [PATCH 085/202] Address feedback --- .../TerraLandingZoneApiClientTest.cs | 8 +- .../TerraWsmApiClientTests.cs | 14 +- src/Tes.ApiClients/CachingRetryHandler.cs | 19 +- src/Tes.ApiClients/HttpApiClient.cs | 29 +--- src/Tes.ApiClients/RetryHandler.cs | 80 +++++++-- src/Tes.ApiClients/TerraWsmApiClient.cs | 4 +- .../ContainerRegistryProviderTests.cs 
| 6 +- src/TesApi.Web/AzureProxy.cs | 16 +- .../CachingWithRetriesAzureProxy.cs | 164 +++++++----------- .../Extensions/PagedInterfaceExtensions.cs | 8 +- .../Management/ContainerRegistryProvider.cs | 6 +- 11 files changed, 167 insertions(+), 187 deletions(-) diff --git a/src/Tes.ApiClients.Tests/TerraLandingZoneApiClientTest.cs b/src/Tes.ApiClients.Tests/TerraLandingZoneApiClientTest.cs index 7e975ec6d..b7c23b052 100644 --- a/src/Tes.ApiClients.Tests/TerraLandingZoneApiClientTest.cs +++ b/src/Tes.ApiClients.Tests/TerraLandingZoneApiClientTest.cs @@ -33,11 +33,11 @@ public async Task GetResourceQuotaAsync_ValidResourceIdReturnsQuotaInformationAn var body = terraApiStubData.GetResourceQuotaApiResponseInJson(); cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync( - It.IsAny>>(), It.IsAny(), It.IsAny())) + It.IsAny>>(), It.IsAny(), It.IsAny>())) .ReturnsAsync(new HttpResponseMessage()); cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync( - It.IsAny>>(), It.IsAny(), It.IsAny())) + It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(body); var quota = await terraLandingZoneApiClient.GetResourceQuotaAsync(terraApiStubData.LandingZoneId, terraApiStubData.BatchAccountId, cacheResults: true, cancellationToken: CancellationToken.None); @@ -64,11 +64,11 @@ public async Task GetLandingZoneResourcesAsync_ListOfLandingZoneResourcesAndGets var body = terraApiStubData.GetResourceApiResponseInJson(); cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync( - It.IsAny>>(), It.IsAny(), It.IsAny())) + It.IsAny>>(), It.IsAny(), It.IsAny>())) .ReturnsAsync(new HttpResponseMessage()); cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync( - It.IsAny>>(), It.IsAny(), It.IsAny())) + It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(body); var resources = await terraLandingZoneApiClient.GetLandingZoneResourcesAsync(terraApiStubData.LandingZoneId, CancellationToken.None); diff --git a/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs 
b/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs index 3e6bf460e..2d4b62a09 100644 --- a/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs +++ b/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs @@ -92,11 +92,11 @@ public async Task GetSasTokenAsync_ValidRequest_ReturnsPayload() Content = new StringContent(terraApiStubData.GetWsmSasTokenApiResponseInJson()) }; - cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) + cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny>())) .ReturnsAsync(response); - cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) - .Returns((Func> action, CancellationToken cancellationToken, Polly.Context _2) => action(cancellationToken)); + cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) + .Returns((Func> action, CancellationToken cancellationToken, RetryHandler.OnRetryHandler _2) => action(cancellationToken)); var apiResponse = await terraWsmApiClient.GetSasTokenAsync(terraApiStubData.WorkspaceId, terraApiStubData.ContainerResourceId, null!, CancellationToken.None); @@ -114,11 +114,11 @@ public async Task GetContainerResourcesAsync_ValidRequest_ReturnsPayload() Content = new StringContent(terraApiStubData.GetContainerResourcesApiResponseInJson()) }; - cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) + cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny>())) .ReturnsAsync(response); - cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) - .Returns((Func> action, CancellationToken cancellationToken, Polly.Context _2) => action(cancellationToken)); + cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) + .Returns((Func> action, CancellationToken cancellationToken, RetryHandler.OnRetryHandler _2) => 
action(cancellationToken)); var apiResponse = await terraWsmApiClient.GetContainerResourcesAsync(terraApiStubData.WorkspaceId, offset: 0, limit: 10, CancellationToken.None); @@ -134,7 +134,7 @@ public async Task DeleteBatchPoolAsync_204Response_Succeeds() var wsmResourceId = Guid.NewGuid(); var response = new HttpResponseMessage(HttpStatusCode.NoContent); - cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) + cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny>())) .ReturnsAsync(response); await terraWsmApiClient.DeleteBatchPoolAsync(terraApiStubData.WorkspaceId, wsmResourceId, CancellationToken.None); diff --git a/src/Tes.ApiClients/CachingRetryHandler.cs b/src/Tes.ApiClients/CachingRetryHandler.cs index 5bc94e775..e2cb2602f 100644 --- a/src/Tes.ApiClients/CachingRetryHandler.cs +++ b/src/Tes.ApiClients/CachingRetryHandler.cs @@ -3,7 +3,6 @@ using Microsoft.Extensions.Caching.Memory; using Microsoft.Extensions.Options; -using Polly; using Tes.ApiClients.Options; namespace Tes.ApiClients @@ -44,13 +43,13 @@ protected CachingRetryHandler() { } /// /// Action to execute /// A for controlling the lifetime of the asynchronous operation. - /// + /// to use. Defaults to none. /// - public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, CancellationToken cancellationToken, Context? 
context = default) + public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, CancellationToken cancellationToken, OnRetryHandler onRetry = default) { ValidateArgs(cacheKey, action); - return await ExecuteWithCacheAsync(cacheKey, () => ExecuteWithRetryAsync(action, cancellationToken, context)); + return await ExecuteWithCacheAsync(cacheKey, () => ExecuteWithRetryAsync(action, cancellationToken, onRetry)); } /// @@ -60,14 +59,14 @@ public virtual async Task ExecuteWithRetryAndCachingAsync(stri /// Action to execute /// /// A for controlling the lifetime of the asynchronous operation. - /// + /// to use. Defaults to none. /// /// - public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, DateTimeOffset cachesExpires, CancellationToken cancellationToken, Context? context = default) + public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, DateTimeOffset cachesExpires, CancellationToken cancellationToken, OnRetryHandler onRetry = default) { ValidateArgs(cacheKey, action); - return await ExecuteWithCacheAsync(cacheKey, () => ExecuteWithRetryAsync(action, cancellationToken, context), cachesExpires); + return await ExecuteWithCacheAsync(cacheKey, () => ExecuteWithRetryAsync(action, cancellationToken, onRetry), cachesExpires); } private static void ValidateArgs(string cacheKey, Func action) @@ -81,13 +80,13 @@ private static void ValidateArgs(string cacheKey, Func } private async Task ExecuteWithCacheAsync(string cacheKey, Func> action) - => (await appCache.GetOrCreateAsync(cacheKey, _ => action()))!; + => await appCache.GetOrCreateAsync(cacheKey, _ => action()); private async Task ExecuteWithCacheAsync(string cacheKey, Func> action, DateTimeOffset cacheExpires) - => (await appCache.GetOrCreateAsync(cacheKey, entry => + => await appCache.GetOrCreateAsync(cacheKey, entry => { entry.AbsoluteExpiration = cacheExpires; return action(); - }))!; + }); } } diff --git 
a/src/Tes.ApiClients/HttpApiClient.cs b/src/Tes.ApiClients/HttpApiClient.cs index 53eeb4bec..1d3a9cdd2 100644 --- a/src/Tes.ApiClients/HttpApiClient.cs +++ b/src/Tes.ApiClients/HttpApiClient.cs @@ -69,23 +69,12 @@ protected HttpApiClient(TokenCredential tokenCredential, string tokenScope, /// protected HttpApiClient() { } - /// - /// A logging Polly retry handler. - /// - /// Calling method name. - /// - private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHandler([System.Runtime.CompilerServices.CallerMemberName] string? caller = default) - => new((exception, timeSpan, retryCount, correlationId) => - { - Logger?.LogError(exception, @"Retrying in {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan} CorrelationId: {CorrelationId}", caller, retryCount, timeSpan, correlationId.ToString("D")); - }); - /// /// A logging Polly retry handler. /// /// Calling method name. /// - private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHttpResponseMessageHandler([System.Runtime.CompilerServices.CallerMemberName] string? 
caller = default) + private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHttpResponseMessageHandler([System.Runtime.CompilerServices.CallerMemberName] string caller = default) => new((result, timeSpan, retryCount, correlationId) => { if (result.Exception is null) @@ -111,8 +100,6 @@ private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHtt protected async Task HttpSendRequestWithRetryPolicyAsync( Func httpRequestFactory, CancellationToken cancellationToken, bool setAuthorizationHeader = false) { - var ctx = new Polly.Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHttpResponseMessageHandler()); return await cachingRetryHandler.ExecuteWithRetryAsync(async ct => { var request = httpRequestFactory(); @@ -122,7 +109,7 @@ protected async Task HttpSendRequestWithRetryPolicyAsync( } return await HttpClient.SendAsync(request, ct); - }, cancellationToken, ctx); + }, cancellationToken, LogRetryErrorOnRetryHttpResponseMessageHandler()); } /// @@ -186,12 +173,8 @@ protected async Task HttpGetRequestWithCachingAndRetryPolicyAsync(Uri re protected async Task HttpGetRequestWithRetryPolicyAsync(Uri requestUrl, CancellationToken cancellationToken, bool setAuthorizationHeader = false) { - var context = new Polly.Context(); - context.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); return await cachingRetryHandler.ExecuteWithRetryAsync(async token => { - var ctx = new Polly.Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHttpResponseMessageHandler()); var response = await cachingRetryHandler.ExecuteWithRetryAsync(async ct => { //request must be recreated in every retry. 
@@ -199,10 +182,10 @@ protected async Task HttpGetRequestWithRetryPolicyAsync(Uri requestUrl, var httpResponse = await HttpClient.SendAsync(httpRequest, ct); return httpResponse.EnsureSuccessStatusCode(); - }, token, ctx); + }, token, LogRetryErrorOnRetryHttpResponseMessageHandler()); return await ReadResponseBodyAsync(response, token); - }, cancellationToken, context); + }, cancellationToken, RetryHandler.LogRetryErrorOnRetryHandler(Logger)); } /// @@ -264,9 +247,7 @@ private async Task CreateGetHttpRequest(Uri requestUrl, bool protected async Task ReadResponseBodyAsync(HttpResponseMessage response, CancellationToken cancellationToken) { - var ctx = new Polly.Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); - return await cachingRetryHandler.ExecuteWithRetryAsync(response.Content.ReadAsStringAsync, cancellationToken, ctx); + return await cachingRetryHandler.ExecuteWithRetryAsync(response.Content.ReadAsStringAsync, cancellationToken, RetryHandler.LogRetryErrorOnRetryHandler(Logger)); } private async Task AddAuthorizationHeaderToRequestAsync(HttpRequestMessage requestMessage, CancellationToken cancellationToken) diff --git a/src/Tes.ApiClients/RetryHandler.cs b/src/Tes.ApiClients/RetryHandler.cs index c6a14b203..689959570 100644 --- a/src/Tes.ApiClients/RetryHandler.cs +++ b/src/Tes.ApiClients/RetryHandler.cs @@ -2,6 +2,7 @@ // Licensed under the MIT License. using System.Net; +using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using Polly; using Polly.Extensions.Http; @@ -19,6 +20,18 @@ public class RetryHandler private readonly AsyncRetryPolicy asyncRetryPolicy = null!; private readonly AsyncRetryPolicy asyncHttpRetryPolicy = null!; + /// + /// An opinionated generic logging retry handler. + /// + /// to use. + /// Name of method originating the retriable operation. 
+ /// + public static OnRetryHandler LogRetryErrorOnRetryHandler(ILogger logger, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + { + return new((exception, timeSpan, retryCount, correlationId) => + logger?.LogError(exception, @"Retrying in {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan} CorrelationId: {CorrelationId}", caller, retryCount, timeSpan, correlationId.ToString("D"))); + } + /// /// The key in where or is stored. /// @@ -68,7 +81,7 @@ public RetryHandler(IOptions retryPolicyOptions) .OrResult(r => r.StatusCode == HttpStatusCode.TooManyRequests) .WaitAndRetryAsync(retryPolicyOptions.Value.MaxRetryCount, (attempt) => TimeSpan.FromSeconds(Math.Pow(retryPolicyOptions.Value.ExponentialBackOffExponent, - attempt)), OnRetry); + attempt)), OnRetry); } public static void OnRetry(DelegateResult result, TimeSpan span, int retryCount, Context ctx) @@ -91,18 +104,53 @@ protected RetryHandler() { } /// public virtual AsyncRetryPolicy AsyncRetryPolicy => asyncRetryPolicy; + /// + /// Configures to use if provided. + /// + /// Return type of asynchronous delegate. + /// to use. Defaults to none. + /// + public static Context PrepareContext(OnRetryHandler onRetry = default) + { + var context = new Context(); + + if (onRetry is not null) + { + context.SetOnRetryHandler(onRetry); + } + + return context; + } + + /// + /// Configures to use if provided. + /// + /// to use. Defaults to none. + /// for the retriable operation. + public static Context PrepareContext(OnRetryHandler onRetry = default) + { + var context = new Context(); + + if (onRetry is not null) + { + context.SetOnRetryHandler(onRetry); + } + + return context; + } + /// /// Executes a delegate with the specified policy. /// - /// Action to execute - /// + /// Action to execute. + /// to use. Defaults to none. /// Result type /// Result instance - public TResult ExecuteWithRetry(Func action, Context? 
context = default) + public TResult ExecuteWithRetry(Func action, OnRetryHandler onRetry = default) { ArgumentNullException.ThrowIfNull(action); - return retryPolicy.Execute(_ => action(), context ?? new()); + return retryPolicy.Execute(_ => action(), PrepareContext(onRetry)); } /// @@ -110,14 +158,14 @@ public TResult ExecuteWithRetry(Func action, Context? context /// /// Action to execute /// A for controlling the lifetime of the asynchronous operation. - /// + /// to use. Defaults to none. /// Result type /// Result instance - public virtual Task ExecuteWithRetryAsync(Func> action, CancellationToken cancellationToken, Context? context = default) + public virtual Task ExecuteWithRetryAsync(Func> action, CancellationToken cancellationToken, OnRetryHandler onRetry = default) { ArgumentNullException.ThrowIfNull(action); - return asyncRetryPolicy.ExecuteAsync((_, ct) => action(ct), context ?? new(), cancellationToken); + return asyncRetryPolicy.ExecuteAsync((_, ct) => action(ct), PrepareContext(onRetry), cancellationToken); } /// @@ -125,13 +173,13 @@ public virtual Task ExecuteWithRetryAsync(Func /// Action to execute /// A for controlling the lifetime of the asynchronous operation. - /// + /// to use. Defaults to none. /// Result instance - public async Task ExecuteWithRetryAsync(Func action, CancellationToken cancellationToken, Context? context = default) + public async Task ExecuteWithRetryAsync(Func action, CancellationToken cancellationToken, OnRetryHandler onRetry = default) { ArgumentNullException.ThrowIfNull(action); - await asyncRetryPolicy.ExecuteAsync((_, ct) => action(ct), context ?? new(), cancellationToken); + await asyncRetryPolicy.ExecuteAsync((_, ct) => action(ct), PrepareContext(onRetry), cancellationToken); } /// @@ -139,13 +187,13 @@ public async Task ExecuteWithRetryAsync(Func action, Ca /// /// Action to execute /// A for controlling the lifetime of the asynchronous operation. - /// + /// to use. Defaults to none. 
/// Result HttpResponse - public virtual async Task ExecuteWithRetryAsync(Func> action, CancellationToken cancellationToken, Context? context = default) + public virtual async Task ExecuteWithRetryAsync(Func> action, CancellationToken cancellationToken, OnRetryHandler onRetry = default) { ArgumentNullException.ThrowIfNull(action); - return await asyncHttpRetryPolicy.ExecuteAsync((_, ct) => action(ct), context ?? new(), cancellationToken); + return await asyncHttpRetryPolicy.ExecuteAsync((_, ct) => action(ct), PrepareContext(onRetry), cancellationToken); } } @@ -156,7 +204,7 @@ public static void SetOnRetryHandler(this Context context, RetryHandler.OnRet context[RetryHandler.OnRetryHandlerKey] = onRetry; } - public static RetryHandler.OnRetryHandler? GetOnRetryHandler(this Context context) + public static RetryHandler.OnRetryHandler GetOnRetryHandler(this Context context) { return context.TryGetValue(RetryHandler.OnRetryHandlerKey, out var handler) ? (RetryHandler.OnRetryHandler)handler : default; } @@ -166,7 +214,7 @@ public static void SetOnRetryHandler(this Context context, RetryHandler.OnRetryH context[RetryHandler.OnRetryHandlerKey] = onRetry; } - public static RetryHandler.OnRetryHandler? GetOnRetryHandler(this Context context) + public static RetryHandler.OnRetryHandler GetOnRetryHandler(this Context context) { return context.TryGetValue(RetryHandler.OnRetryHandlerKey, out var handler) ? 
(RetryHandler.OnRetryHandler)handler : default; } diff --git a/src/Tes.ApiClients/TerraWsmApiClient.cs b/src/Tes.ApiClients/TerraWsmApiClient.cs index c22308491..155a6d32b 100644 --- a/src/Tes.ApiClients/TerraWsmApiClient.cs +++ b/src/Tes.ApiClients/TerraWsmApiClient.cs @@ -200,11 +200,11 @@ private string GetCreateBatchPoolUrl(Guid workspaceId) return builder.Uri.AbsoluteUri; } - private HttpContent GetBatchPoolRequestContent(ApiCreateBatchPoolRequest apiCreateBatchPool) + private static HttpContent GetBatchPoolRequestContent(ApiCreateBatchPoolRequest apiCreateBatchPool) => new StringContent(JsonSerializer.Serialize(apiCreateBatchPool, new JsonSerializerOptions() { DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull }), Encoding.UTF8, "application/json"); - private string ToQueryString(SasTokenApiParameters sasTokenApiParameters) + private static string ToQueryString(SasTokenApiParameters sasTokenApiParameters) => AppendQueryStringParams( ParseQueryStringParameter("sasIpRange", sasTokenApiParameters.SasIpRange), ParseQueryStringParameter("sasExpirationDuration", sasTokenApiParameters.SasExpirationInSeconds.ToString()), diff --git a/src/TesApi.Tests/ContainerRegistryProviderTests.cs b/src/TesApi.Tests/ContainerRegistryProviderTests.cs index 3a94ab463..77e5af610 100644 --- a/src/TesApi.Tests/ContainerRegistryProviderTests.cs +++ b/src/TesApi.Tests/ContainerRegistryProviderTests.cs @@ -51,7 +51,7 @@ public async Task GetContainerRegistryInfoAsync_ServerIsAccessible_ReturnsAndAdd var server = "registry.com"; var image = $"{server}/image"; retryHandlerMock.Setup(r => - r.ExecuteWithRetryAsync(It.IsAny>>>(), It.IsAny(), It.IsAny())) + r.ExecuteWithRetryAsync(It.IsAny>>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(new List() { new ContainerRegistryInfo() { RegistryServer = server } @@ -81,7 +81,7 @@ public async Task GetContainerRegistryInfoAsync_ServerInCache_ReturnsRegistryInf Assert.AreEqual(server, container.RegistryServer); appCacheMock.Verify(c => 
c.TryGetValue(It.Is(v => $"{nameof(ContainerRegistryProvider)}:{image}".Equals(v)), out It.Ref.IsAny), Times.Once()); retryHandlerMock.Verify(r => - r.ExecuteWithRetryAsync(It.IsAny>>>(), It.IsAny(), It.IsAny()), Times.Never); + r.ExecuteWithRetryAsync(It.IsAny>>>(), It.IsAny(), It.IsAny()), Times.Never); } [TestMethod] @@ -103,7 +103,7 @@ public async Task GetContainerRegistryInfoAsync_NoAccessibleServerNoServerCached retryHandlerMock.Setup(r => r.ExecuteWithRetryAsync( It.IsAny>>>(), - It.IsAny(), It.IsAny())) + It.IsAny(), It.IsAny())) .ReturnsAsync(new List() { new ContainerRegistryInfo() { RegistryServer = server } diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index 6433d7f5c..657e7d979 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -268,11 +268,10 @@ public async Task CreateAutoPoolModeBatchJobAsync(string jobId, CloudTask cloudT try { logger.LogInformation($"TES task: {cloudTask.Id} adding task to job."); - var ctx = new Context(); - ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenJobNotFound(LogRetryErrorOnRetryHandler())); job = await batchRetryPolicy.ExecuteAsync((_, ct) => batchClient.JobOperations.GetJobAsync(job.Id, cancellationToken: ct), - ctx, cancellationToken); + RetryHandler.PrepareContext(OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenJobNotFound(LogRetryErrorOnRetryHandler())), + cancellationToken); await job.AddTaskAsync(cloudTask, cancellationToken: cancellationToken); logger.LogInformation($"TES task: {cloudTask.Id} added task successfully."); @@ -316,11 +315,10 @@ public async Task AddBatchTaskAsync(string tesTaskId, CloudTask cloudTask, PoolI ArgumentException.ThrowIfNullOrEmpty(poolInformation?.PoolId, nameof(poolInformation)); logger.LogInformation("TES task: {TesTask} - Adding task to job {BatchJob}", tesTaskId, poolInformation.PoolId); - var ctx = new Context(); - 
ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenJobNotFound(LogRetryErrorOnRetryHandler())); var job = await batchRetryPolicy.ExecuteAsync((_, ct) => batchClient.JobOperations.GetJobAsync(poolInformation.PoolId, cancellationToken: ct), - ctx, cancellationToken); + RetryHandler.PrepareContext(OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenJobNotFound(LogRetryErrorOnRetryHandler())), + cancellationToken); await job.AddTaskAsync(cloudTask, cancellationToken: cancellationToken); logger.LogInformation("TES task: {TesTask} - Added task successfully", tesTaskId); @@ -564,9 +562,9 @@ public async Task DeleteBatchTaskAsync(string tesTaskId, PoolInformation pool, C foreach (var task in batchTasksToDelete) { logger.LogInformation("Deleting task {BatchTask}", task.Id); - var ctx = new Context(); - ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenNodeNotReady(LogRetryErrorOnRetryHandler())); - await batchRetryPolicy.ExecuteAsync((_, ct) => task.DeleteAsync(cancellationToken: ct), ctx, cancellationToken); + await batchRetryPolicy.ExecuteAsync((_, ct) => task.DeleteAsync(cancellationToken: ct), + RetryHandler.PrepareContext(OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenNodeNotReady(LogRetryErrorOnRetryHandler())), + cancellationToken); } } diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index ac8f8b246..37ca9846d 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -88,18 +88,6 @@ private static RetryHandler.OnRetryHandler OnRetryMicrosoftAzureBatchCommonBatch OnRetry?.Invoke(outcome, timespan, retryCount, correlationId); }); - /// - /// A logging Polly retry handler. - /// - /// Calling method name. 
- /// - private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHandler([System.Runtime.CompilerServices.CallerMemberName] string caller = default) - => new((exception, timeSpan, retryCount, correlationId) => - { - logger?.LogError(exception, @"Retrying in {Method} due to '{Message}': RetryCount: {RetryCount} RetryCount: {TimeSpan} CorrelationId: {CorrelationId}", - caller, exception.Message, retryCount, timeSpan.ToString("c"), correlationId.ToString("D")); - }); - /// public Task CreateAutoPoolModeBatchJobAsync(string jobId, CloudTask cloudTask, PoolInformation poolInformation, CancellationToken cancellationToken) => azureProxy.CreateAutoPoolModeBatchJobAsync(jobId, cloudTask, poolInformation, cancellationToken); @@ -109,9 +97,8 @@ public async Task CreateBatchJobAsync(PoolInformation poolInformation, Cancellat { try { - var ctx = new Context(); - ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenExists(LogRetryErrorOnRetryHandler())); - await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.CreateBatchJobAsync(poolInformation, ct), cancellationToken, ctx); + await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.CreateBatchJobAsync(poolInformation, ct), cancellationToken, + OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenExists(RetryHandler.LogRetryErrorOnRetryHandler(logger))); } catch (BatchException exc) when (BatchErrorCodeStrings.JobExists.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { } @@ -122,9 +109,8 @@ public async Task AddBatchTaskAsync(string tesTaskId, CloudTask cloudTask, PoolI { try { - var ctx = new Context(); - ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenExists(LogRetryErrorOnRetryHandler())); - await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.AddBatchTaskAsync(tesTaskId, cloudTask, poolInformation, ct), cancellationToken, ctx); + await cachingRetryHandler.ExecuteWithRetryAsync(ct => 
azureProxy.AddBatchTaskAsync(tesTaskId, cloudTask, poolInformation, ct), cancellationToken, + OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenExists(RetryHandler.LogRetryErrorOnRetryHandler(logger))); } catch (BatchException exc) when (BatchErrorCodeStrings.TaskExists.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { } @@ -135,9 +121,8 @@ public async Task DeleteBatchJobAsync(PoolInformation poolInformation, Cancellat { try { - var ctx = new Context(); - ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound(LogRetryErrorOnRetryHandler())); - await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchJobAsync(poolInformation, ct), cancellationToken, ctx); + await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchJobAsync(poolInformation, ct), cancellationToken, + OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound(RetryHandler.LogRetryErrorOnRetryHandler(logger))); } catch (BatchException exc) when (BatchErrorCodeStrings.JobNotFound.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { } @@ -148,9 +133,8 @@ public async Task DeleteBatchJobAsync(string taskId, CancellationToken cancellat { try { - var ctx = new Context(); - ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound(LogRetryErrorOnRetryHandler())); - await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchJobAsync(taskId, ct), cancellationToken, ctx); + await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchJobAsync(taskId, ct), cancellationToken, + OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound(RetryHandler.LogRetryErrorOnRetryHandler(logger))); } catch (BatchException exc) when (BatchErrorCodeStrings.JobNotFound.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { } @@ -161,9 +145,8 @@ public async Task 
DeleteBatchTaskAsync(string taskId, PoolInformation poolInform { try { - var ctx = new Context(); - ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound(LogRetryErrorOnRetryHandler())); - await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchTaskAsync(taskId, poolInformation, ct), cancellationToken, ctx); + await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchTaskAsync(taskId, poolInformation, ct), cancellationToken, + OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound(RetryHandler.LogRetryErrorOnRetryHandler(logger))); } catch (BatchException exc) when (BatchErrorCodeStrings.TaskNotFound.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { } @@ -174,9 +157,8 @@ public async Task DeleteBatchPoolAsync(string poolId, CancellationToken cancella { try { - var ctx = new Context(); - ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound(LogRetryErrorOnRetryHandler())); - await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchPoolAsync(poolId, ct), cancellationToken, ctx); + await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchPoolAsync(poolId, ct), cancellationToken, + OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound(RetryHandler.LogRetryErrorOnRetryHandler(logger))); } catch (BatchException exc) when (BatchErrorCodeStrings.TaskNotFound.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { } @@ -185,17 +167,15 @@ public async Task DeleteBatchPoolAsync(string poolId, CancellationToken cancella /// public Task GetBatchPoolAsync(string poolId, CancellationToken cancellationToken, DetailLevel detailLevel) { - var ctx = new Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBatchPoolAsync(poolId, ct, detailLevel), cancellationToken, ctx); + 
return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBatchPoolAsync(poolId, ct, detailLevel), cancellationToken, + RetryHandler.LogRetryErrorOnRetryHandler(logger)); } /// public Task GetBatchJobAsync(string jobId, CancellationToken cancellationToken, DetailLevel detailLevel) { - var ctx = new Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBatchJobAsync(jobId, ct, detailLevel), cancellationToken, ctx); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBatchJobAsync(jobId, ct, detailLevel), cancellationToken, + RetryHandler.LogRetryErrorOnRetryHandler(logger)); } /// @@ -208,91 +188,80 @@ public async Task DeleteBatchComputeNodesAsync(string poolId, IEnumerable public Task DownloadBlobAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) { - var ctx = new Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DownloadBlobAsync(blobAbsoluteUri, ct), cancellationToken, ctx); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DownloadBlobAsync(blobAbsoluteUri, ct), cancellationToken, + RetryHandler.LogRetryErrorOnRetryHandler(logger)); } /// public Task BlobExistsAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) { - var ctx = new Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.BlobExistsAsync(blobAbsoluteUri, ct), cancellationToken, ctx); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.BlobExistsAsync(blobAbsoluteUri, ct), cancellationToken, + RetryHandler.LogRetryErrorOnRetryHandler(logger)); } /// public Task> GetActivePoolIdsAsync(string prefix, TimeSpan minAge, CancellationToken cancellationToken) { - var ctx = new Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); - return 
cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetActivePoolIdsAsync(prefix, minAge, ct), cancellationToken, ctx); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetActivePoolIdsAsync(prefix, minAge, ct), cancellationToken, + RetryHandler.LogRetryErrorOnRetryHandler(logger)); } /// public IAsyncEnumerable GetActivePoolsAsync(string hostName) { - var ctx = new Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); - return cachingRetryHandler.ExecuteWithRetry(() => azureProxy.GetActivePoolsAsync(hostName), ctx); + return cachingRetryHandler.ExecuteWithRetry(() => azureProxy.GetActivePoolsAsync(hostName), + RetryHandler.LogRetryErrorOnRetryHandler(logger)); } /// public int GetBatchActiveJobCount() { - var ctx = new Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); - return cachingRetryHandler.ExecuteWithRetry(azureProxy.GetBatchActiveJobCount, ctx); + return cachingRetryHandler.ExecuteWithRetry(azureProxy.GetBatchActiveJobCount, + RetryHandler.LogRetryErrorOnRetryHandler(logger)); } /// public IEnumerable GetBatchActiveNodeCountByVmSize() { - var ctx = new Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); - return cachingRetryHandler.ExecuteWithRetry(azureProxy.GetBatchActiveNodeCountByVmSize, ctx); + return cachingRetryHandler.ExecuteWithRetry(azureProxy.GetBatchActiveNodeCountByVmSize, + RetryHandler.LogRetryErrorOnRetryHandler(logger)); } /// public int GetBatchActivePoolCount() { - var ctx = new Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); - return cachingRetryHandler.ExecuteWithRetry(azureProxy.GetBatchActivePoolCount, ctx); + return cachingRetryHandler.ExecuteWithRetry(azureProxy.GetBatchActivePoolCount, + RetryHandler.LogRetryErrorOnRetryHandler(logger)); } /// public Task GetBatchJobAndTaskStateAsync(Tes.Models.TesTask tesTask, bool usingAutoPools, CancellationToken cancellationToken) { - var ctx = new Context(); - 
ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBatchJobAndTaskStateAsync(tesTask, usingAutoPools, ct), cancellationToken, ctx); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBatchJobAndTaskStateAsync(tesTask, usingAutoPools, ct), cancellationToken, + RetryHandler.LogRetryErrorOnRetryHandler(logger)); } /// public Task GetNextBatchJobIdAsync(string tesTaskId, CancellationToken cancellationToken) { - var ctx = new Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetNextBatchJobIdAsync(tesTaskId, ct), cancellationToken, ctx); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetNextBatchJobIdAsync(tesTaskId, ct), cancellationToken, + RetryHandler.LogRetryErrorOnRetryHandler(logger)); } /// public Task> GetPoolIdsReferencedByJobsAsync(CancellationToken cancellationToken) { - var ctx = new Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); - return cachingRetryHandler.ExecuteWithRetryAsync(azureProxy.GetPoolIdsReferencedByJobsAsync, cancellationToken, ctx); + return cachingRetryHandler.ExecuteWithRetryAsync(azureProxy.GetPoolIdsReferencedByJobsAsync, cancellationToken, + RetryHandler.LogRetryErrorOnRetryHandler(logger)); } /// public Task GetStorageAccountKeyAsync(StorageAccountInfo storageAccountInfo, CancellationToken cancellationToken) { - var ctx = new Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); return cachingRetryHandler.ExecuteWithRetryAndCachingAsync($"{nameof(CachingWithRetriesAzureProxy)}:{storageAccountInfo.Id}", - ct => azureProxy.GetStorageAccountKeyAsync(storageAccountInfo, ct), DateTimeOffset.Now.AddHours(1), cancellationToken, ctx); + ct => azureProxy.GetStorageAccountKeyAsync(storageAccountInfo, ct), DateTimeOffset.Now.AddHours(1), cancellationToken, + RetryHandler.LogRetryErrorOnRetryHandler(logger)); } /// @@ 
-302,9 +271,8 @@ public async Task GetStorageAccountInfoAsync(string storageA if (storageAccountInfo is null) { - var ctx = new Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); - storageAccountInfo = await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetStorageAccountInfoAsync(storageAccountName, ct), cancellationToken, ctx); + storageAccountInfo = await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetStorageAccountInfoAsync(storageAccountName, ct), cancellationToken, + RetryHandler.LogRetryErrorOnRetryHandler(logger)); if (storageAccountInfo is not null) { @@ -318,49 +286,43 @@ public async Task GetStorageAccountInfoAsync(string storageA /// public Task> ListBlobsAsync(Uri directoryUri, CancellationToken cancellationToken) { - var ctx = new Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.ListBlobsAsync(directoryUri, ct), cancellationToken, ctx); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.ListBlobsAsync(directoryUri, ct), cancellationToken, + RetryHandler.LogRetryErrorOnRetryHandler(logger)); } /// public Task> ListOldJobsToDeleteAsync(TimeSpan oldestJobAge, CancellationToken cancellationToken) { - var ctx = new Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.ListOldJobsToDeleteAsync(oldestJobAge, ct), cancellationToken, ctx); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.ListOldJobsToDeleteAsync(oldestJobAge, ct), cancellationToken, + RetryHandler.LogRetryErrorOnRetryHandler(logger)); } /// public Task> ListOrphanedJobsToDeleteAsync(TimeSpan minJobAge, CancellationToken cancellationToken) { - var ctx = new Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.ListOrphanedJobsToDeleteAsync(minJobAge, ct), cancellationToken, 
ctx); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.ListOrphanedJobsToDeleteAsync(minJobAge, ct), cancellationToken, + RetryHandler.LogRetryErrorOnRetryHandler(logger)); } /// public Task UploadBlobAsync(Uri blobAbsoluteUri, string content, CancellationToken cancellationToken) { - var ctx = new Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.UploadBlobAsync(blobAbsoluteUri, content, ct), cancellationToken, ctx); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.UploadBlobAsync(blobAbsoluteUri, content, ct), cancellationToken, + RetryHandler.LogRetryErrorOnRetryHandler(logger)); } /// public Task UploadBlobFromFileAsync(Uri blobAbsoluteUri, string filePath, CancellationToken cancellationToken) { - var ctx = new Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.UploadBlobFromFileAsync(blobAbsoluteUri, filePath, ct), cancellationToken, ctx); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.UploadBlobFromFileAsync(blobAbsoluteUri, filePath, ct), cancellationToken, + RetryHandler.LogRetryErrorOnRetryHandler(logger)); } /// public Task GetBlobPropertiesAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) { - var ctx = new Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBlobPropertiesAsync(blobAbsoluteUri, ct), cancellationToken, ctx); + return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBlobPropertiesAsync(blobAbsoluteUri, ct), cancellationToken, + RetryHandler.LogRetryErrorOnRetryHandler(logger)); } /// @@ -377,9 +339,8 @@ public async Task CreateBatchPoolAsync(BatchModels.Pool poolInf { try { - var ctx = new Context(); - 
ctx.SetOnRetryHandler(OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenExists(LogRetryErrorOnRetryHandler())); - return await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.CreateBatchPoolAsync(poolInfo, isPreemptable, ct), cancellationToken, ctx); + return await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.CreateBatchPoolAsync(poolInfo, isPreemptable, ct), cancellationToken, + OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenExists(RetryHandler.LogRetryErrorOnRetryHandler(logger))); } catch (BatchException exc) when (BatchErrorCodeStrings.PoolExists.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { @@ -394,35 +355,32 @@ public Task DeleteBatchPoolIfExistsAsync(string poolId, CancellationToken cancel /// public Task GetFullAllocationStateAsync(string poolId, CancellationToken cancellationToken) { - var ctx = new Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); return cachingRetryHandler.ExecuteWithRetryAndCachingAsync( $"{nameof(CachingWithRetriesAzureProxy)}:{poolId}", ct => azureProxy.GetFullAllocationStateAsync(poolId, ct), DateTimeOffset.Now.Add(BatchPoolService.RunInterval).Subtract(TimeSpan.FromSeconds(1)), - cancellationToken, ctx); + cancellationToken, RetryHandler.LogRetryErrorOnRetryHandler(logger)); } /// public IAsyncEnumerable ListComputeNodesAsync(string poolId, DetailLevel detailLevel) { - var ctx = new Context(); - ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); - return cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListComputeNodesAsync(poolId, detailLevel), cachingRetryHandler.RetryPolicy, ctx); + return cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListComputeNodesAsync(poolId, detailLevel), cachingRetryHandler.RetryPolicy, + RetryHandler.LogRetryErrorOnRetryHandler(logger)); } /// public IAsyncEnumerable ListTasksAsync(string jobId, DetailLevel detailLevel) { - var ctx = new Context(); - 
ctx.SetOnRetryHandler(LogRetryErrorOnRetryHandler()); - return cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListTasksAsync(jobId, detailLevel), cachingRetryHandler.RetryPolicy, ctx); + return cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListTasksAsync(jobId, detailLevel), cachingRetryHandler.RetryPolicy, + RetryHandler.LogRetryErrorOnRetryHandler(logger)); } /// public Task DisableBatchPoolAutoScaleAsync(string poolId, CancellationToken cancellationToken) => azureProxy.DisableBatchPoolAutoScaleAsync(poolId, cancellationToken); /// - public Task EnableBatchPoolAutoScaleAsync(string poolId, bool preemptable, TimeSpan interval, IAzureProxy.BatchPoolAutoScaleFormulaFactory formulaFactory, CancellationToken cancellationToken) => azureProxy.EnableBatchPoolAutoScaleAsync(poolId, preemptable, interval, formulaFactory, cancellationToken); + public Task EnableBatchPoolAutoScaleAsync(string poolId, bool preemptable, TimeSpan interval, IAzureProxy.BatchPoolAutoScaleFormulaFactory formulaFactory, CancellationToken cancellationToken) + => azureProxy.EnableBatchPoolAutoScaleAsync(poolId, preemptable, interval, formulaFactory, cancellationToken); } } diff --git a/src/TesApi.Web/Extensions/PagedInterfaceExtensions.cs b/src/TesApi.Web/Extensions/PagedInterfaceExtensions.cs index d594cb0ac..c2c3eef2c 100644 --- a/src/TesApi.Web/Extensions/PagedInterfaceExtensions.cs +++ b/src/TesApi.Web/Extensions/PagedInterfaceExtensions.cs @@ -57,16 +57,16 @@ public static IAsyncEnumerable ToAsyncEnumerable(this IPage source, Fun /// Policy retrying calls made while enumerating results returned by . /// Method returning . /// Policy retrying call to . - /// An optional . + /// to use. Defaults to none. Assumes policies configured like the ones in . 
/// - public static IAsyncEnumerable ExecuteAsync(this AsyncRetryPolicy asyncRetryPolicy, Func> func, RetryPolicy retryPolicy, Polly.Context ctx = default) + public static IAsyncEnumerable ExecuteAsync(this AsyncRetryPolicy asyncRetryPolicy, Func> func, RetryPolicy retryPolicy, Tes.ApiClients.RetryHandler.OnRetryHandler onRetry = default) { ArgumentNullException.ThrowIfNull(asyncRetryPolicy); ArgumentNullException.ThrowIfNull(func); ArgumentNullException.ThrowIfNull(retryPolicy); - ctx ??= new(); - return new PollyAsyncEnumerable((retryPolicy).Execute(() => func()), asyncRetryPolicy, ctx); + var ctx = Tes.ApiClients.RetryHandler.PrepareContext(onRetry); + return new PollyAsyncEnumerable(retryPolicy.Execute(_ => func(), ctx), asyncRetryPolicy, ctx); } #region Implementation classes diff --git a/src/TesApi.Web/Management/ContainerRegistryProvider.cs b/src/TesApi.Web/Management/ContainerRegistryProvider.cs index fb9810d8b..cba4a3a63 100644 --- a/src/TesApi.Web/Management/ContainerRegistryProvider.cs +++ b/src/TesApi.Web/Management/ContainerRegistryProvider.cs @@ -99,11 +99,7 @@ public bool IsImagePublic(string imageName) private async Task LookUpAndAddToCacheContainerRegistryInfoAsync(string imageName, CancellationToken cancellationToken) { - var ctx = new Polly.Context(); - ctx.SetOnRetryHandler((exception, timespan, retryCount, correlationId) => - Logger.LogError(exception, @"Retrying in {Method} due to '{Message}': RetryCount: {RetryCount} RetryCount: {TimeSpan} CorrelationId: {CorrelationId}", - nameof(LookUpAndAddToCacheContainerRegistryInfoAsync), exception.Message, retryCount, timespan.ToString("c"), correlationId.ToString("D"))); - var repositories = await CachingRetryHandler.ExecuteWithRetryAsync(GetAccessibleContainerRegistriesAsync, cancellationToken: cancellationToken, context: ctx); + var repositories = await CachingRetryHandler.ExecuteWithRetryAsync(GetAccessibleContainerRegistriesAsync, cancellationToken, 
RetryHandler.LogRetryErrorOnRetryHandler(Logger)); var requestedRepo = repositories?.FirstOrDefault(reg => reg.RegistryServer.Equals(imageName.Split('/').FirstOrDefault(), StringComparison.OrdinalIgnoreCase)); From 95fc2d93ac291219cb3db314a4ae273de31b0627 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Tue, 14 Nov 2023 10:20:33 -0800 Subject: [PATCH 086/202] Remove retries specifically on reading http responses --- src/Tes.ApiClients/HttpApiClient.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Tes.ApiClients/HttpApiClient.cs b/src/Tes.ApiClients/HttpApiClient.cs index 1d3a9cdd2..608c8a60e 100644 --- a/src/Tes.ApiClients/HttpApiClient.cs +++ b/src/Tes.ApiClients/HttpApiClient.cs @@ -245,9 +245,9 @@ private async Task CreateGetHttpRequest(Uri requestUrl, bool return httpRequest; } - protected async Task ReadResponseBodyAsync(HttpResponseMessage response, CancellationToken cancellationToken) + protected static async Task ReadResponseBodyAsync(HttpResponseMessage response, CancellationToken cancellationToken) { - return await cachingRetryHandler.ExecuteWithRetryAsync(response.Content.ReadAsStringAsync, cancellationToken, RetryHandler.LogRetryErrorOnRetryHandler(Logger)); + return await response.Content.ReadAsStringAsync(cancellationToken); } private async Task AddAuthorizationHeaderToRequestAsync(HttpRequestMessage requestMessage, CancellationToken cancellationToken) From 2add67361dc9492208a35f2a91e4b2ec21fb98b0 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Tue, 14 Nov 2023 10:56:52 -0800 Subject: [PATCH 087/202] Fix imports ordering --- src/TesApi.Web/PoolScheduler.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TesApi.Web/PoolScheduler.cs b/src/TesApi.Web/PoolScheduler.cs index a77d0632e..a5b7dc8c4 100644 --- a/src/TesApi.Web/PoolScheduler.cs +++ b/src/TesApi.Web/PoolScheduler.cs @@ -7,8 +7,8 @@ using System.Linq; using System.Threading; using System.Threading.Tasks; -using Microsoft.Azure.Batch.Common; 
using Microsoft.Azure.Batch; +using Microsoft.Azure.Batch.Common; using Microsoft.Extensions.Logging; using Tes.Models; using Tes.Repository; From 3c16567411d70edb71a61b367e985178afd24427 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 15 Nov 2023 14:11:50 -0800 Subject: [PATCH 088/202] fix predicates --- src/TesApi.Web/PoolScheduler.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/TesApi.Web/PoolScheduler.cs b/src/TesApi.Web/PoolScheduler.cs index a5b7dc8c4..3378bd32e 100644 --- a/src/TesApi.Web/PoolScheduler.cs +++ b/src/TesApi.Web/PoolScheduler.cs @@ -121,7 +121,7 @@ private async ValueTask ProcessTasksAsync(IBatchPool pool, DateTime now, IAsyncE var deletionCandidateTasks = AsyncEnumerable.Empty(); var deletionCandidateCreationCutoff = now - TimeSpan.FromMinutes(10); - var stateTransitionTimeCutoff = now + TimeSpan.FromMinutes(3); + var stateTransitionTimeCutoffForDeletions = now - TimeSpan.FromMinutes(3); // the value of the timespan should be larger than the corresponding value in CompletedTaskListPredicate, but doesn't have to be await foreach (var task in tasks.WithCancellation(cancellationToken)) { @@ -131,7 +131,7 @@ private async ValueTask ProcessTasksAsync(IBatchPool pool, DateTime now, IAsyncE batchStateCandidateTasks = batchStateCandidateTasks.Append(task); } - if (TaskState.Completed.Equals(task.State) && task.CreationTime < deletionCandidateCreationCutoff && task.StateTransitionTime > stateTransitionTimeCutoff) + if (TaskState.Completed.Equals(task.State) && task.CreationTime < deletionCandidateCreationCutoff && task.StateTransitionTime > stateTransitionTimeCutoffForDeletions) { deletionCandidateTasks = deletionCandidateTasks.Append(new IBatchScheduler.CloudTaskId(pool.Id, task.Id, task.CreationTime.Value)); } @@ -150,12 +150,12 @@ private async ValueTask ProcessTasksAsync(IBatchPool pool, DateTime now, IAsyncE /// /// Shared between and . 
/// - private static bool ActiveTaskListPredicate(CloudTask task) => !TaskState.Active.Equals(task.State); + private static bool ActiveTaskListPredicate(CloudTask task) => TaskState.Active.Equals(task.State); /// /// Shared between and . /// - private static bool CompletedTaskListPredicate(CloudTask task, DateTime now) => !TaskState.Completed.Equals(task.State) && task.StateTransitionTime < now - TimeSpan.FromMinutes(2); + private static bool CompletedTaskListPredicate(CloudTask task, DateTime now) => TaskState.Completed.Equals(task.State) && task.StateTransitionTime < now - TimeSpan.FromMinutes(2); /// /// Updates each task based on the provided states. From aecebaa070ca138d069ded42ba02e3d1b3e6585e Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 15 Nov 2023 16:14:39 -0800 Subject: [PATCH 089/202] fix time math around deleting cloud tasks --- src/TesApi.Web/BatchScheduler.cs | 46 ++++++++++++++------------------ src/TesApi.Web/PoolScheduler.cs | 13 +++++---- 2 files changed, 28 insertions(+), 31 deletions(-) diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index c38cfcfe0..5526a703b 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -15,15 +15,14 @@ using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using Tes.Extensions; -using Tes.Models; using TesApi.Web.Events; using TesApi.Web.Extensions; using TesApi.Web.Management; using TesApi.Web.Management.Models.Quotas; using TesApi.Web.Runner; using TesApi.Web.Storage; -using static TesApi.Web.IBatchScheduler; using BatchModels = Microsoft.Azure.Management.Batch.Models; +using CloudTaskId = TesApi.Web.IBatchScheduler.CloudTaskId; using TesException = Tes.Models.TesException; using TesFileType = Tes.Models.TesFileType; using TesInput = Tes.Models.TesInput; @@ -39,6 +38,11 @@ namespace TesApi.Web /// public partial class BatchScheduler : IBatchScheduler { + /// + /// Time tasks must live before being deleted + /// + // 
https://learn.microsoft.com/azure/batch/best-practices#manage-task-lifetime + public static TimeSpan BatchDeleteNewTaskWorkaroundTimeSpan = TimeSpan.FromMinutes(10); internal const string PoolHostName = "CoA-TES-HostName"; internal const string PoolIsDedicated = "CoA-TES-IsDedicated"; @@ -63,7 +67,7 @@ public partial class BatchScheduler : IBatchScheduler private readonly IStorageAccessProvider storageAccessProvider; private readonly IBatchQuotaVerifier quotaVerifier; private readonly IBatchSkuInformationProvider skuInformationProvider; - private readonly IList tesTaskStateTransitions; + private readonly IReadOnlyList tesTaskStateTransitions; private readonly bool usePreemptibleVmsOnly; private readonly string batchNodesSubnetId; private readonly bool disableBatchNodesPublicIpAddress; @@ -281,15 +285,9 @@ Tes.Models.TesOutputFileLog ConvertOutputFileLogToTesOutputFileLog(AzureBatchTas } } - async Task SetTaskCompleted(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) - { - await TerminateBatchTaskAsync(tesTask, batchInfo, cancellationToken); - return await SetTaskStateAndLog(tesTask, TesState.COMPLETEEnum, batchInfo, cancellationToken); - } - async Task SetTaskExecutorError(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { - await TerminateBatchTaskAsync(tesTask, batchInfo, cancellationToken); + // TODO: check task's log to ensure that the error was actually an executor error. Return the correct error state. 
return await SetTaskStateAndLog(tesTask, TesState.EXECUTORERROREnum, batchInfo, cancellationToken); } @@ -335,27 +333,23 @@ Task HandleInfoUpdate(TesTask tesTask, CombinedBatchTaskInfo batchInfo, Ca tesTaskStateTransitions = new List() { - new TesTaskStateTransition(condition: null, AzureBatchTaskState.TaskState.CancellationRequested, alternateSystemLogItem: null, TerminateBatchTaskAsync), - new TesTaskStateTransition(tesTaskIsInitializing, AzureBatchTaskState.TaskState.NodeAllocationFailed, alternateSystemLogItem: null, RequeueTaskAfterFailureAsync), - new TesTaskStateTransition(tesTaskIsInitializing, AzureBatchTaskState.TaskState.NodeStartTaskFailed, "Please open an issue. There should have been an error reported here.", SetTaskSystemError), - new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.Initializing, alternateSystemLogItem: null, (tesTask, info, ct) => SetTaskStateAndLog(tesTask, TesState.INITIALIZINGEnum, info, ct)), - new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.Running, alternateSystemLogItem: null, (tesTask, info, ct) => SetTaskStateAndLog(tesTask, TesState.RUNNINGEnum, info, ct)), - new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.CompletedSuccessfully, alternateSystemLogItem: null, SetTaskCompleted), - new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.CompletedWithErrors, "Please open an issue. There should have been an error reported here.", SetTaskExecutorError), - new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.NodeFailedDuringStartupOrExecution, "Please open an issue. 
There should have been an error reported here.", SetTaskSystemError), - new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.NodePreempted, alternateSystemLogItem: null, HandlePreemptedNode), - new TesTaskStateTransition(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.NodeFilesUploadOrDownloadFailed, alternateSystemLogItem: null, HandleInfoUpdate), - new TesTaskStateTransition(condition: null, AzureBatchTaskState.TaskState.InfoUpdate, alternateSystemLogItem: null, HandleInfoUpdate), + new(condition: null, AzureBatchTaskState.TaskState.CancellationRequested, alternateSystemLogItem: null, TerminateBatchTaskAsync), + new(tesTaskIsInitializing, AzureBatchTaskState.TaskState.NodeAllocationFailed, alternateSystemLogItem: null, RequeueTaskAfterFailureAsync), + new(tesTaskIsInitializing, AzureBatchTaskState.TaskState.NodeStartTaskFailed, "Please open an issue. There should have been an error reported here.", SetTaskSystemError), + new(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.Initializing, alternateSystemLogItem: null, (tesTask, info, ct) => SetTaskStateAndLog(tesTask, TesState.INITIALIZINGEnum, info, ct)), + new(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.Running, alternateSystemLogItem: null, (tesTask, info, ct) => SetTaskStateAndLog(tesTask, TesState.RUNNINGEnum, info, ct)), + new(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.CompletedSuccessfully, alternateSystemLogItem: null, (tesTask, info, ct) => SetTaskStateAndLog(tesTask, TesState.COMPLETEEnum, info, ct)), + new(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.CompletedWithErrors, "Please open an issue. There should have been an error reported here.", SetTaskExecutorError), + new(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.NodeFailedDuringStartupOrExecution, "Please open an issue. 
There should have been an error reported here.", SetTaskSystemError), + new(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.NodePreempted, alternateSystemLogItem: null, HandlePreemptedNode), + new(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.NodeFilesUploadOrDownloadFailed, alternateSystemLogItem: null, HandleInfoUpdate), + new(condition: null, AzureBatchTaskState.TaskState.InfoUpdate, alternateSystemLogItem: null, HandleInfoUpdate), }.AsReadOnly(); } private async Task DeleteCompletedTaskAsync(string taskId, string jobId, DateTime taskCreated, CancellationToken cancellationToken) { - // https://learn.microsoft.com/azure/batch/best-practices#manage-task-lifetime - var mins10 = TimeSpan.FromMinutes(10); - var now = DateTimeOffset.UtcNow; - - if (!(now - taskCreated > mins10)) + if (!(DateTimeOffset.UtcNow < taskCreated.ToUniversalTime() + BatchDeleteNewTaskWorkaroundTimeSpan)) { return false; } diff --git a/src/TesApi.Web/PoolScheduler.cs b/src/TesApi.Web/PoolScheduler.cs index 3378bd32e..aac248f46 100644 --- a/src/TesApi.Web/PoolScheduler.cs +++ b/src/TesApi.Web/PoolScheduler.cs @@ -12,7 +12,7 @@ using Microsoft.Extensions.Logging; using Tes.Models; using Tes.Repository; -using static TesApi.Web.IBatchPool; +using CloudTaskBatchTaskState = TesApi.Web.IBatchPool.CloudTaskBatchTaskState; namespace TesApi.Web { @@ -28,6 +28,9 @@ internal class PoolScheduler : OrchestrateOnBatchSchedulerServiceBase /// public static readonly TimeSpan RunInterval = TimeSpan.FromSeconds(30); // The very fastest process inside of Azure Batch accessing anything within pools or jobs uses a 30 second polling interval + private static readonly TimeSpan StateTransitionTimeForDeletionTimeSpan = 0.75 * BatchScheduler.BatchDeleteNewTaskWorkaroundTimeSpan; + private static readonly TimeSpan CompletedTaskListTimeSpan = 0.5 * BatchScheduler.BatchDeleteNewTaskWorkaroundTimeSpan; + /// /// Default constructor /// @@ -120,8 +123,8 @@ private async ValueTask 
ProcessTasksAsync(IBatchPool pool, DateTime now, IAsyncE var batchStateCandidateTasks = AsyncEnumerable.Empty(); var deletionCandidateTasks = AsyncEnumerable.Empty(); - var deletionCandidateCreationCutoff = now - TimeSpan.FromMinutes(10); - var stateTransitionTimeCutoffForDeletions = now - TimeSpan.FromMinutes(3); // the value of the timespan should be larger than the corresponding value in CompletedTaskListPredicate, but doesn't have to be + var deletionCandidateCreationCutoff = now - BatchScheduler.BatchDeleteNewTaskWorkaroundTimeSpan; + var stateTransitionTimeCutoffForDeletions = now - StateTransitionTimeForDeletionTimeSpan; await foreach (var task in tasks.WithCancellation(cancellationToken)) { @@ -131,7 +134,7 @@ private async ValueTask ProcessTasksAsync(IBatchPool pool, DateTime now, IAsyncE batchStateCandidateTasks = batchStateCandidateTasks.Append(task); } - if (TaskState.Completed.Equals(task.State) && task.CreationTime < deletionCandidateCreationCutoff && task.StateTransitionTime > stateTransitionTimeCutoffForDeletions) + if (TaskState.Completed.Equals(task.State) && task.CreationTime < deletionCandidateCreationCutoff && task.StateTransitionTime < stateTransitionTimeCutoffForDeletions) { deletionCandidateTasks = deletionCandidateTasks.Append(new IBatchScheduler.CloudTaskId(pool.Id, task.Id, task.CreationTime.Value)); } @@ -155,7 +158,7 @@ private async ValueTask ProcessTasksAsync(IBatchPool pool, DateTime now, IAsyncE /// /// Shared between and . /// - private static bool CompletedTaskListPredicate(CloudTask task, DateTime now) => TaskState.Completed.Equals(task.State) && task.StateTransitionTime < now - TimeSpan.FromMinutes(2); + private static bool CompletedTaskListPredicate(CloudTask task, DateTime now) => TaskState.Completed.Equals(task.State) && task.StateTransitionTime < now - CompletedTaskListTimeSpan; /// /// Updates each task based on the provided states. 
From 9c6181a96c2dcb8df47af9b18315d7be849d3f93 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 16 Nov 2023 09:10:29 -0800 Subject: [PATCH 090/202] fix more time math around deleting cloud tasks --- src/TesApi.Web/BatchScheduler.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 5526a703b..49e5b59d0 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -349,7 +349,7 @@ Task HandleInfoUpdate(TesTask tesTask, CombinedBatchTaskInfo batchInfo, Ca private async Task DeleteCompletedTaskAsync(string taskId, string jobId, DateTime taskCreated, CancellationToken cancellationToken) { - if (!(DateTimeOffset.UtcNow < taskCreated.ToUniversalTime() + BatchDeleteNewTaskWorkaroundTimeSpan)) + if (DateTimeOffset.UtcNow <= taskCreated.ToUniversalTime() + BatchDeleteNewTaskWorkaroundTimeSpan) { return false; } From 6046b0459f89824882a8b3ee963e85317eaec032 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 16 Nov 2023 09:10:50 -0800 Subject: [PATCH 091/202] Remove exception thrown during normal process shutdown --- src/Tes/Repository/PostgreSqlCachingRepository.cs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/Tes/Repository/PostgreSqlCachingRepository.cs b/src/Tes/Repository/PostgreSqlCachingRepository.cs index 5705d4477..29e6899c9 100644 --- a/src/Tes/Repository/PostgreSqlCachingRepository.cs +++ b/src/Tes/Repository/PostgreSqlCachingRepository.cs @@ -65,12 +65,16 @@ protected PostgreSqlCachingRepository(Microsoft.Extensions.Hosting.IHostApplicat case TaskStatus.RanToCompletion: Logger.LogCritical("Repository WriterWorkerAsync unexpectedly completed."); break; + case TaskStatus.Canceled: + return; // This is the normal exit for WriterWorkerAsync + default: + Logger.LogCritical(new System.Diagnostics.UnreachableException($"Repository WriterWorkerAsync ended with task status '{task.Status}'"), @"Repository 
WriterWorkerAsync ended with task status '{TaskStatus}'.", task.Status); + break; } await Task.Delay(50); // Give the logger time to flush. hostApplicationLifetime?.StopApplication(); - }, - TaskContinuationOptions.NotOnCanceled); + }, TaskContinuationOptions.ExecuteSynchronously); } /// @@ -233,6 +237,8 @@ protected virtual void Dispose(bool disposing) } catch (OperationCanceledException ex) when (writerWorkerCancellationTokenSource.Token == ex.CancellationToken) { } // Expected return from Wait(). + + writerWorkerCancellationTokenSource.Dispose(); } _disposedValue = true; From 7b2640e02c064466f23056bc32856f91d4a954de Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 16 Nov 2023 13:51:43 -0800 Subject: [PATCH 092/202] some cleanup --- src/Tes/Models/TesState.cs | 11 ++ src/TesApi.Tests/BatchSchedulerTests.cs | 63 +++++---- src/TesApi.Web/AzureBatchTaskState.cs | 26 +++- src/TesApi.Web/BatchScheduler.cs | 124 +++++++++--------- .../Events/RunnerEventsProcessor.cs | 20 +-- .../OrchestrateOnBatchSchedulerServiceBase.cs | 4 +- src/TesApi.Web/PoolScheduler.cs | 14 +- 7 files changed, 154 insertions(+), 108 deletions(-) diff --git a/src/Tes/Models/TesState.cs b/src/Tes/Models/TesState.cs index bd4695bc3..d506a0749 100644 --- a/src/Tes/Models/TesState.cs +++ b/src/Tes/Models/TesState.cs @@ -38,66 +38,77 @@ public enum TesState /// /// Enum UNKNOWNEnum for UNKNOWN /// + /// The state of the task is unknown. This provides a safe default for messages where this field is missing, for example, so that a missing field does not accidentally imply that the state is QUEUED. [EnumMember(Value = "UNKNOWN")] UNKNOWNEnum = 1, /// /// Enum QUEUEDEnum for QUEUED /// + /// The task is queued. [EnumMember(Value = "QUEUED")] QUEUEDEnum = 2, /// /// Enum INITIALIZINGEnum for INITIALIZING /// + /// The task has been assigned to a worker and is currently preparing to run. For example, the worker may be turning on, downloading input files, etc. 
[EnumMember(Value = "INITIALIZING")] INITIALIZINGEnum = 3, /// /// Enum RUNNINGEnum for RUNNING /// + /// The task is running. Input files are downloaded and the first Executor has been started. [EnumMember(Value = "RUNNING")] RUNNINGEnum = 4, /// /// Enum PAUSEDEnum for PAUSED /// + /// The task is paused. An implementation may have the ability to pause a task, but this is not required. [EnumMember(Value = "PAUSED")] PAUSEDEnum = 5, /// /// Enum COMPLETEEnum for COMPLETE /// + /// The task has completed running. Executors have exited without error and output files have been successfully uploaded. [EnumMember(Value = "COMPLETE")] COMPLETEEnum = 6, /// /// Enum EXECUTORERROREnum for EXECUTOR_ERROR /// + /// The task encountered an error in one of the Executor processes. Generally, this means that an Executor exited with a non-zero exit code. [EnumMember(Value = "EXECUTOR_ERROR")] EXECUTORERROREnum = 7, /// /// Enum SYSTEMERROREnum for SYSTEM_ERROR /// + /// The task was stopped due to a system error, but not from an Executor, for example an upload failed due to network issues, the worker's ran out of disk space, etc. [EnumMember(Value = "SYSTEM_ERROR")] SYSTEMERROREnum = 8, /// /// Enum CANCELEDEnum for CANCELED /// + /// The task was canceled by the user. [EnumMember(Value = "CANCELED")] CANCELEDEnum = 9, /// /// Enum PREEMPTEDEnum for PREEMPTED /// + /// The task is stopped (preempted) by the system. The reasons for this would be tied to the specific system running the job. Generally, this means that the system reclaimed the compute capacity for reallocation. [EnumMember(Value = "PREEMPTED")] PREEMPTEDEnum = 10, /// /// Enum CANCELINGEnum for CANCELING /// + /// The task was canceled by the user, but the downstream resources are still awaiting deletion. 
[EnumMember(Value = "CANCELING")] CANCELINGEnum = 11, } diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index e8ab9f5fc..7313c9d55 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -24,7 +24,6 @@ using TesApi.Web.Management; using TesApi.Web.Management.Models.Quotas; using TesApi.Web.Storage; -using static TesApi.Web.Management.IBatchQuotaVerifier; namespace TesApi.Tests { @@ -272,7 +271,7 @@ public async Task TesTaskFailsWhenBatchNodeDiskIsFull() var tesTask = GetTesTask(); tesTask.State = TesState.INITIALIZINGEnum; - (var failureReason, var systemLog) = await ProcessTesTaskAndGetFailureReasonAndSystemLogAsync(tesTask, BatchTaskStates.NodeDiskFull); + (var failureReason, var systemLog) = await ProcessTesTaskAndGetFailureReasonAndSystemLogAsync(tesTask, BatchTaskStates.NodeDiskFull[0]); GuardAssertsWithTesTask(tesTask, () => { @@ -817,7 +816,7 @@ public async Task TaskStateTransitionsFromInitializingState() [TestMethod] public async Task TaskStateTransitionsFromQueuedState() { - Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum)); + Assert.AreEqual(TesState.INITIALIZINGEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, new AzureBatchTaskState[] { default })); Assert.AreEqual(TesState.CANCELEDEnum, await GetNewTesTaskStateAsync(TesState.QUEUEDEnum, BatchTaskStates.CancellationRequested)); } @@ -837,13 +836,13 @@ public async Task TaskIsRequeuedUpToThreeTimesForTransientErrors() }; await GetNewTesTaskStateAsync(tesTask, azureProxyReturnValues); - await GuardAssertsWithTesTask(tesTask, async () => Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(tesTask, BatchTaskStates.NodeAllocationFailed))); + await GuardAssertsWithTesTask(tesTask, async () => Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(tesTask, BatchTaskStates.NodeAllocationFailed[0]))); await GetNewTesTaskStateAsync(tesTask, 
azureProxyReturnValues); - await GuardAssertsWithTesTask(tesTask, async () => Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(tesTask, BatchTaskStates.NodeAllocationFailed))); + await GuardAssertsWithTesTask(tesTask, async () => Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(tesTask, BatchTaskStates.NodeAllocationFailed[0]))); await GetNewTesTaskStateAsync(tesTask, azureProxyReturnValues); - await GuardAssertsWithTesTask(tesTask, async () => Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(tesTask, BatchTaskStates.NodeAllocationFailed))); + await GuardAssertsWithTesTask(tesTask, async () => Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(tesTask, BatchTaskStates.NodeAllocationFailed[0]))); await GetNewTesTaskStateAsync(tesTask, azureProxyReturnValues); - await GuardAssertsWithTesTask(tesTask, async () => Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(tesTask, BatchTaskStates.NodeAllocationFailed))); + await GuardAssertsWithTesTask(tesTask, async () => Assert.AreEqual(TesState.SYSTEMERROREnum, await GetNewTesTaskStateAsync(tesTask, BatchTaskStates.NodeAllocationFailed[0]))); } [TestMethod] @@ -852,11 +851,11 @@ public async Task TaskThatFailsWithNodeAllocationErrorIsRequeuedOnDifferentVmSiz var tesTask = GetTesTask(); await GetNewTesTaskStateAsync(tesTask); - await GetNewTesTaskStateAsync(tesTask, BatchTaskStates.NodeAllocationFailed); + await GetNewTesTaskStateAsync(tesTask, BatchTaskStates.NodeAllocationFailed[0]); var firstAttemptVmSize = tesTask.Logs[0].VirtualMachineInfo.VmSize; await GetNewTesTaskStateAsync(tesTask); - await GetNewTesTaskStateAsync(tesTask, BatchTaskStates.NodeAllocationFailed); + await GetNewTesTaskStateAsync(tesTask, BatchTaskStates.NodeAllocationFailed[0]); var secondAttemptVmSize = tesTask.Logs[1].VirtualMachineInfo.VmSize; GuardAssertsWithTesTask(tesTask, () => Assert.AreNotEqual(firstAttemptVmSize, secondAttemptVmSize)); @@ -877,7 +876,7 
@@ public async Task TaskGetsCancelled() var tesTask = new TesTask { Id = "test", PoolId = "pool1", State = TesState.CANCELINGEnum, Logs = new() { new() } }; var azureProxyReturnValues = AzureProxyReturnValues.Defaults; - azureProxyReturnValues.BatchTaskState = BatchTaskStates.CancellationRequested; + azureProxyReturnValues.BatchTaskState = BatchTaskStates.CancellationRequested[0]; Mock azureProxy = default; var azureProxySetter = new Action>(mock => { @@ -954,7 +953,7 @@ public async Task SuccessfullyCompletedTaskContainsBatchNodeMetrics() DiskUsedInKiB=1000000".Replace(" ", string.Empty); var azureProxyReturnValues = AzureProxyReturnValues.Defaults; - azureProxyReturnValues.BatchTaskState = BatchTaskStates.TaskCompletedSuccessfully; + azureProxyReturnValues.BatchTaskState = BatchTaskStates.TaskCompletedSuccessfully[0]; azureProxyReturnValues.DownloadedBlobContent = metricsFileContent; _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(azureProxyReturnValues), azureProxyReturnValues); @@ -989,7 +988,7 @@ public async Task SuccessfullyCompletedTaskContainsCromwellResultCode() tesTask.State = TesState.INITIALIZINGEnum; var azureProxyReturnValues = AzureProxyReturnValues.Defaults; - azureProxyReturnValues.BatchTaskState = BatchTaskStates.TaskCompletedSuccessfully; + azureProxyReturnValues.BatchTaskState = BatchTaskStates.TaskCompletedSuccessfully[0]; azureProxyReturnValues.DownloadedBlobContent = "2"; var azureProxy = GetMockAzureProxy(azureProxyReturnValues); @@ -1638,8 +1637,18 @@ private static async Task GetNewTesTaskStateAsync(TesTask tesTask, Azu return tesTask.State; } - private static Task GetNewTesTaskStateAsync(TesState currentTesTaskState, AzureBatchTaskState batchTaskState = default) - => GetNewTesTaskStateAsync(new TesTask { Id = "test", State = currentTesTaskState, Executors = new() { new() { Image = "imageName1", Command = new() { "command" } } } }, batchTaskState); + private static async Task 
GetNewTesTaskStateAsync(TesState currentTesTaskState, IEnumerable batchTaskStates) + { + var tesTask = new TesTask { Id = "test", State = currentTesTaskState, Executors = new() { new() { Image = "imageName1", Command = new() { "command" } } } }; + TesState result = default; + + foreach (var batchTaskState in batchTaskStates) + { + result = await GetNewTesTaskStateAsync(tesTask, batchTaskState); + } + + return result; + } private static Task GetNewTesTaskStateAsync(TesTask tesTask, AzureBatchTaskState batchTaskState = default) { @@ -1837,17 +1846,21 @@ internal static async ValueTask GuardAssertsWithTesTask(TesTask tesTask, Func new(AzureBatchTaskState.TaskState.InfoUpdate); - public static AzureBatchTaskState TaskPreparing => new(AzureBatchTaskState.TaskState.Initializing, CloudTaskCreationTime: DateTimeOffset.UtcNow); - public static AzureBatchTaskState TaskRunning => new(AzureBatchTaskState.TaskState.Running, CloudTaskCreationTime: DateTimeOffset.UtcNow - TimeSpan.FromMinutes(6)); - public static AzureBatchTaskState TaskCompletedSuccessfully => new(AzureBatchTaskState.TaskState.CompletedSuccessfully, BatchTaskExitCode: 0); - public static AzureBatchTaskState TaskFailed => new(AzureBatchTaskState.TaskState.CompletedWithErrors, BatchTaskExitCode: -1); - public static AzureBatchTaskState NodeDiskFull => new(AzureBatchTaskState.TaskState.NodeFailedDuringStartupOrExecution, Failure: new("DiskFull", new[] { "Error message." })); - public static AzureBatchTaskState UploadOrDownloadFailed => new(AzureBatchTaskState.TaskState.NodeFilesUploadOrDownloadFailed); - public static AzureBatchTaskState NodeAllocationFailed => new(AzureBatchTaskState.TaskState.NodeAllocationFailed, Failure: new(AzureBatchTaskState.TaskState.NodeAllocationFailed.ToString(), new[] { "Error message." 
})); - public static AzureBatchTaskState NodePreempted => new(AzureBatchTaskState.TaskState.NodePreempted); - public static AzureBatchTaskState NodeStartTaskFailed => new(AzureBatchTaskState.TaskState.NodeStartTaskFailed); - public static AzureBatchTaskState CancellationRequested => new(AzureBatchTaskState.TaskState.CancellationRequested, CloudTaskCreationTime: DateTimeOffset.UtcNow - TimeSpan.FromMinutes(12)); + public static AzureBatchTaskState[] TaskActive => new[] { new AzureBatchTaskState(AzureBatchTaskState.TaskState.InfoUpdate) }; + public static AzureBatchTaskState[] TaskPreparing => new[] { new AzureBatchTaskState(AzureBatchTaskState.TaskState.Initializing, CloudTaskCreationTime: DateTimeOffset.UtcNow) }; + public static AzureBatchTaskState[] TaskRunning => new[] { new AzureBatchTaskState(AzureBatchTaskState.TaskState.Running, CloudTaskCreationTime: DateTimeOffset.UtcNow - TimeSpan.FromMinutes(6)) }; + public static AzureBatchTaskState[] TaskCompletedSuccessfully => new[] { new AzureBatchTaskState(AzureBatchTaskState.TaskState.CompletedSuccessfully, BatchTaskExitCode: 0) }; + public static AzureBatchTaskState[] TaskFailed => new[] + { + new AzureBatchTaskState(AzureBatchTaskState.TaskState.InfoUpdate, Failure: new(AzureBatchTaskState.ExecutorError, new[] { TaskFailureInformationCodes.FailureExitCode, @"1" }), ExecutorExitCode: 1), + new AzureBatchTaskState(AzureBatchTaskState.TaskState.CompletedWithErrors, Failure: new(AzureBatchTaskState.SystemError, new[] { TaskFailureInformationCodes.FailureExitCode, @"1" }), BatchTaskExitCode: 1) + }; + public static AzureBatchTaskState[] NodeDiskFull => new[] { new AzureBatchTaskState(AzureBatchTaskState.TaskState.NodeFailedDuringStartupOrExecution, Failure: new("DiskFull", new[] { "Error message." 
})) }; + public static AzureBatchTaskState[] UploadOrDownloadFailed => new[] { new AzureBatchTaskState(AzureBatchTaskState.TaskState.NodeFilesUploadOrDownloadFailed) }; + public static AzureBatchTaskState[] NodeAllocationFailed => new[] { new AzureBatchTaskState(AzureBatchTaskState.TaskState.NodeAllocationFailed, Failure: new(AzureBatchTaskState.TaskState.NodeAllocationFailed.ToString(), new[] { "Error message." })) }; + public static AzureBatchTaskState[] NodePreempted => new[] { new AzureBatchTaskState(AzureBatchTaskState.TaskState.NodePreempted) }; + public static AzureBatchTaskState[] NodeStartTaskFailed => new[] { new AzureBatchTaskState(AzureBatchTaskState.TaskState.NodeStartTaskFailed) }; + public static AzureBatchTaskState[] CancellationRequested => new[] { new AzureBatchTaskState(AzureBatchTaskState.TaskState.CancellationRequested, CloudTaskCreationTime: DateTimeOffset.UtcNow - TimeSpan.FromMinutes(12)) }; } private class AzureProxyReturnValues diff --git a/src/TesApi.Web/AzureBatchTaskState.cs b/src/TesApi.Web/AzureBatchTaskState.cs index da9946a1b..7acd64a67 100644 --- a/src/TesApi.Web/AzureBatchTaskState.cs +++ b/src/TesApi.Web/AzureBatchTaskState.cs @@ -23,6 +23,7 @@ namespace TesApi.Web /// . /// . /// Warning. First item in enumeration is the Warning code, rest of items are additional system log entries. + /// Replace previous logged value of the task start time with . public record class AzureBatchTaskState( TaskState State, IEnumerable OutputFileLogs = default, @@ -34,8 +35,25 @@ public record class AzureBatchTaskState( int? ExecutorExitCode = default, DateTimeOffset? BatchTaskEndTime = default, int? BatchTaskExitCode = default, - IEnumerable Warning = default) + IEnumerable Warning = default, + bool ReplaceBatchTaskStartTime = default) { + /// + /// Unknown error. Either an exception or a missing reason. + /// + public const string UnknownError = @"UnknownError"; + + /// + /// . 
+ /// + public const string ExecutorError = @"ExecutorError"; + + + /// + /// . + /// + public const string SystemError = @"SystemError"; + /// /// TesTask's state /// @@ -132,9 +150,9 @@ protected AzureBatchTaskState(AzureBatchTaskState original, string appendToSyste { Failure = original.Failure switch { - null => new("UnknownError", Enumerable.Empty().Append(appendToSystemLog)), - { SystemLogs: null } => new(original.Failure?.Reason ?? "UnknownError", Enumerable.Empty().Append(appendToSystemLog)), - _ => new(original.Failure?.Reason ?? "UnknownError", original.Failure?.SystemLogs.Append(appendToSystemLog)), + null => new(UnknownError, Enumerable.Empty().Append(appendToSystemLog)), + { SystemLogs: null } => new(original.Failure.Value.Reason ?? UnknownError, Enumerable.Empty().Append(appendToSystemLog)), + _ => new(original.Failure.Value.Reason ?? UnknownError, original.Failure.Value.SystemLogs.Append(appendToSystemLog)), }; } } diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 49e5b59d0..1fbf787ec 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -39,10 +39,10 @@ namespace TesApi.Web public partial class BatchScheduler : IBatchScheduler { /// - /// Time tasks must live before being deleted + /// Minimum lifetime of a . 
/// // https://learn.microsoft.com/azure/batch/best-practices#manage-task-lifetime - public static TimeSpan BatchDeleteNewTaskWorkaroundTimeSpan = TimeSpan.FromMinutes(10); + public static TimeSpan BatchDeleteNewTaskWorkaroundTimeSpan { get; } = TimeSpan.FromMinutes(10); internal const string PoolHostName = "CoA-TES-HostName"; internal const string PoolIsDedicated = "CoA-TES-IsDedicated"; @@ -184,15 +184,7 @@ public BatchScheduler( async Task SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { { - var newData = new CombinedBatchTaskInfo(batchInfo, false); - if (newData.Failure is null) { newData.AlternateSystemLogItem = null; } - var newDataText = System.Text.Json.JsonSerializer.Serialize( - newData, - new System.Text.Json.JsonSerializerOptions() - { - DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingDefault, - Converters = { new System.Text.Json.Serialization.JsonStringEnumConverter(System.Text.Json.JsonNamingPolicy.CamelCase) } - }); + var newDataText = SerializeToString(new CombinedBatchTaskInfo(batchInfo, false)); if ("{}".Equals(newDataText) && newTaskState == tesTask.State) { @@ -200,35 +192,45 @@ async Task SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, Comb return false; } - logger.LogDebug(@"Setting task {TesTask} with metadata {Metadata}.", tesTask.Id, newDataText); + logger.LogDebug(@"Setting task {TesTask} with metadata {BatchTaskState} {Metadata}.", tesTask.Id, batchInfo.State, newDataText); } var (batchNodeMetrics, taskStartTime, taskEndTime, cromwellRcCode) = newTaskState == TesState.COMPLETEEnum ? 
await GetBatchNodeMetricsAndCromwellResultCodeAsync(tesTask, cancellationToken) : default; + var taskAsString = SerializeToString(tesTask); + lock (setTaskStateLock) { tesTask.State = newTaskState; var tesTaskLog = tesTask.GetOrAddTesTaskLog(); - var tesTaskExecutorLog = tesTaskLog.GetOrAddExecutorLog(); + tesTaskLog.BatchNodeMetrics ??= batchNodeMetrics; + tesTaskLog.CromwellResultCode ??= cromwellRcCode; + tesTaskLog.EndTime ??= batchInfo.BatchTaskEndTime ?? taskEndTime; - tesTaskLog.BatchNodeMetrics = batchNodeMetrics; - tesTaskLog.CromwellResultCode = cromwellRcCode; - tesTaskLog.EndTime ??= taskEndTime ?? batchInfo.BatchTaskEndTime; - tesTaskLog.StartTime ??= taskStartTime ?? batchInfo.BatchTaskStartTime; - tesTaskExecutorLog.StartTime ??= batchInfo.ExecutorStartTime; - tesTaskExecutorLog.EndTime ??= batchInfo.ExecutorEndTime; - tesTaskExecutorLog.ExitCode ??= batchInfo.ExecutorExitCode; + if (batchInfo.ExecutorEndTime is not null || batchInfo.ExecutorStartTime is not null || batchInfo.ExecutorExitCode is not null) + { + var tesTaskExecutorLog = tesTaskLog.GetOrAddExecutorLog(); + tesTaskExecutorLog.StartTime ??= batchInfo.ExecutorStartTime; + tesTaskExecutorLog.EndTime ??= batchInfo.ExecutorEndTime; + tesTaskExecutorLog.ExitCode ??= batchInfo.ExecutorExitCode; + } - if (tesTaskLog.Outputs is null) + if (batchInfo.ReplaceBatchTaskStartTime) { - tesTaskLog.Outputs = batchInfo.OutputFileLogs?.Select(ConvertOutputFileLogToTesOutputFileLog).ToList(); + tesTaskLog.StartTime = batchInfo.BatchTaskStartTime ?? taskStartTime; } - else if (!tesTaskLog.Outputs.Any()) + else + { + tesTaskLog.StartTime ??= batchInfo.BatchTaskStartTime ?? taskStartTime; + } + + if (batchInfo.OutputFileLogs is not null) { - tesTaskLog.Outputs.AddRange(batchInfo.OutputFileLogs?.Select(ConvertOutputFileLogToTesOutputFileLog) ?? 
Enumerable.Empty()); + tesTaskLog.Outputs ??= new(); + tesTaskLog.Outputs.AddRange(batchInfo.OutputFileLogs.Select(ConvertOutputFileLogToTesOutputFileLog)); } // Only accurate when the task completes successfully, otherwise it's the Batch time as reported from Batch @@ -251,18 +253,14 @@ async Task SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, Comb } } - if (batchInfo.Failure is not null) + if (batchInfo.Failure.HasValue) { - tesTask.SetFailureReason(batchInfo.Failure?.Reason); - - if (batchInfo.Failure?.SystemLogs is not null) - { - tesTask.AddToSystemLog(batchInfo.Failure?.SystemLogs); - } - else if (!string.IsNullOrWhiteSpace(batchInfo.AlternateSystemLogItem)) - { - tesTask.AddToSystemLog(new[] { batchInfo.AlternateSystemLogItem }); - } + tesTask.SetFailureReason( + batchInfo.Failure.Value.Reason, + (batchInfo.Failure.Value.SystemLogs ?? (string.IsNullOrWhiteSpace(batchInfo.AlternateSystemLogItem) + ? Enumerable.Empty() + : Enumerable.Empty().Append(batchInfo.AlternateSystemLogItem)) + ).ToArray()); } } @@ -272,7 +270,7 @@ async Task SetTaskStateAndLog(TesTask tesTask, TesState newTaskState, Comb await taskExecutionScriptingManager.TryUploadServerTesTask(tesTask, "server-tes-task-completed.json", cancellationToken); } - return true; + return !taskAsString.Equals(SerializeToString(tesTask)); Tes.Models.TesOutputFileLog ConvertOutputFileLogToTesOutputFileLog(AzureBatchTaskState.OutputFileLog fileLog) { @@ -283,12 +281,24 @@ Tes.Models.TesOutputFileLog ConvertOutputFileLogToTesOutputFileLog(AzureBatchTas Url = fileLog.Url.AbsoluteUri }; } + + static string SerializeToString(T item) + => System.Text.Json.JsonSerializer.Serialize(item, new System.Text.Json.JsonSerializerOptions() + { + DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingDefault, + Converters = { new System.Text.Json.Serialization.JsonStringEnumConverter(System.Text.Json.JsonNamingPolicy.CamelCase) } + }); } - async Task SetTaskExecutorError(TesTask 
tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) + async Task SetCompletedWithErrors(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) { - // TODO: check task's log to ensure that the error was actually an executor error. Return the correct error state. - return await SetTaskStateAndLog(tesTask, TesState.EXECUTORERROREnum, batchInfo, cancellationToken); + var newTaskState = tesTask.FailureReason switch + { + AzureBatchTaskState.ExecutorError => TesState.EXECUTORERROREnum, + _ => TesState.SYSTEMERROREnum, + }; + + return await SetTaskStateAndLog(tesTask, newTaskState, batchInfo, cancellationToken); } async Task SetTaskSystemError(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) @@ -309,9 +319,7 @@ Task RequeueTaskAfterFailureAsync(TesTask tesTask, CombinedBatchTaskInfo b : SetTaskStateAfterFailureAsync(tesTask, TesState.QUEUEDEnum, batchInfo, cancellationToken); Task AddSystemLogAndSetTaskSystemErrorAsync(TesTask tesTask, CombinedBatchTaskInfo batchInfo, string additionalSystemLogItem, CancellationToken cancellationToken) - { - return SetTaskSystemError(tesTask, new(batchInfo, additionalSystemLogItem), cancellationToken); - } + => SetTaskSystemError(tesTask, new(batchInfo, additionalSystemLogItem), cancellationToken); bool HandlePreemptedNode(TesTask tesTask, CombinedBatchTaskInfo batchInfo) { @@ -326,24 +334,21 @@ bool HandlePreemptedNode(TesTask tesTask, CombinedBatchTaskInfo batchInfo) return true; } - Task HandleInfoUpdate(TesTask tesTask, CombinedBatchTaskInfo batchInfo, CancellationToken cancellationToken) - { - return SetTaskStateAndLog(tesTask, tesTask.State, batchInfo, cancellationToken); - } + const string alternateSystemLogMissingFailure = @"Please open an issue at https://github.com/microsoft/ga4gh-tes/issues. 
There should have been a failure reported here."; tesTaskStateTransitions = new List() { new(condition: null, AzureBatchTaskState.TaskState.CancellationRequested, alternateSystemLogItem: null, TerminateBatchTaskAsync), new(tesTaskIsInitializing, AzureBatchTaskState.TaskState.NodeAllocationFailed, alternateSystemLogItem: null, RequeueTaskAfterFailureAsync), - new(tesTaskIsInitializing, AzureBatchTaskState.TaskState.NodeStartTaskFailed, "Please open an issue. There should have been an error reported here.", SetTaskSystemError), + new(tesTaskIsInitializing, AzureBatchTaskState.TaskState.NodeStartTaskFailed, alternateSystemLogMissingFailure, SetTaskSystemError), new(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.Initializing, alternateSystemLogItem: null, (tesTask, info, ct) => SetTaskStateAndLog(tesTask, TesState.INITIALIZINGEnum, info, ct)), new(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.Running, alternateSystemLogItem: null, (tesTask, info, ct) => SetTaskStateAndLog(tesTask, TesState.RUNNINGEnum, info, ct)), new(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.CompletedSuccessfully, alternateSystemLogItem: null, (tesTask, info, ct) => SetTaskStateAndLog(tesTask, TesState.COMPLETEEnum, info, ct)), - new(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.CompletedWithErrors, "Please open an issue. There should have been an error reported here.", SetTaskExecutorError), - new(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.NodeFailedDuringStartupOrExecution, "Please open an issue. 
There should have been an error reported here.", SetTaskSystemError), + new(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.CompletedWithErrors, alternateSystemLogMissingFailure, SetCompletedWithErrors), + new(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.NodeFailedDuringStartupOrExecution, alternateSystemLogMissingFailure, SetTaskSystemError), new(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.NodePreempted, alternateSystemLogItem: null, HandlePreemptedNode), - new(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.NodeFilesUploadOrDownloadFailed, alternateSystemLogItem: null, HandleInfoUpdate), - new(condition: null, AzureBatchTaskState.TaskState.InfoUpdate, alternateSystemLogItem: null, HandleInfoUpdate), + new(tesTaskIsInitializingOrRunning, AzureBatchTaskState.TaskState.NodeFilesUploadOrDownloadFailed, alternateSystemLogItem: null, (tesTask, info, ct) => SetTaskStateAndLog(tesTask, tesTask.State, info, ct)), + new(condition: null, AzureBatchTaskState.TaskState.InfoUpdate, alternateSystemLogItem: null, (tesTask, info, ct) => SetTaskStateAndLog(tesTask, tesTask.State, info, ct)), }.AsReadOnly(); } @@ -371,11 +376,6 @@ private async Task TerminateBatchTaskAsync(TesTask tesTask, CombinedBatchT { switch (batchInfo.State) { - case AzureBatchTaskState.TaskState.CompletedSuccessfully: - case AzureBatchTaskState.TaskState.CompletedWithErrors: - return false; // It's already finished - - case AzureBatchTaskState.TaskState.CancellationRequested: if (!TesTask.ActiveStates.Contains(tesTask.State)) { @@ -408,7 +408,7 @@ private async Task TerminateBatchTaskAsync(TesTask tesTask, CombinedBatchT try // TODO: remove (and undo changes to taskExecutionScriptingManager) { - await taskExecutionScriptingManager.TryUploadServerTesTask(tesTask, "server-tes-task-completed.json", cancellationToken); + if (TesState.CANCELEDEnum.Equals(tesTask.State)) { await taskExecutionScriptingManager.TryUploadServerTesTask(tesTask, 
"server-tes-task-completed.json", cancellationToken); } } catch (Exception exc) { @@ -814,7 +814,7 @@ Task HandleException(Exception exception, string poolKey, TesTask tesTask) default: tesTask.State = TesState.SYSTEMERROREnum; - tesTask.SetFailureReason("UnknownError", $"{exception?.GetType().FullName}: {exception?.Message}", exception?.StackTrace); + tesTask.SetFailureReason(AzureBatchTaskState.UnknownError, $"{exception?.GetType().FullName}: {exception?.Message}", exception?.StackTrace); logger.LogError(exception, "TES task: {TesTask} Exception: {ExceptionType}: {ExceptionMessage}", tesTask.Id, exception?.GetType().FullName, exception?.Message); break; } @@ -1425,8 +1425,8 @@ private static Dictionary DelimitedTextToDictionary(string text, /// public async IAsyncEnumerable GetEventMessagesAsync([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken, string @event) { - const string eventsFolderName = "events"; - var prefix = eventsFolderName + "/"; + const string eventsFolderName = "events/"; + var prefix = eventsFolderName; if (!string.IsNullOrWhiteSpace(@event)) { @@ -1435,7 +1435,7 @@ public async IAsyncEnumerable GetEventMessagesAsync([System var tesInternalSegments = StorageAccountUrlSegments.Create(storageAccessProvider.GetInternalTesBlobUrlWithoutSasToken(string.Empty)); var eventsStartIndex = (string.IsNullOrEmpty(tesInternalSegments.BlobName) ? 
string.Empty : (tesInternalSegments.BlobName + "/")).Length; - var eventsEndIndex = eventsStartIndex + eventsFolderName.Length + 1; + var eventsEndIndex = eventsStartIndex + eventsFolderName.Length; await foreach (var blobItem in azureProxy.ListBlobsWithTagsAsync( new(await storageAccessProvider.GetInternalTesBlobUrlAsync( @@ -1519,7 +1519,7 @@ public async ValueTask ActionAsync(TesTask tesTask, AzureBatchTaskState ba private record CombinedBatchTaskInfo : AzureBatchTaskState { /// - /// Copy constructor that defaults (to enable hiding if serialized) + /// Copy constructor that defaults (to enable hiding when serialized) /// /// to copy /// Parameter that exists to not override the default copy constructor diff --git a/src/TesApi.Web/Events/RunnerEventsProcessor.cs b/src/TesApi.Web/Events/RunnerEventsProcessor.cs index 58b3a1da0..d82421158 100644 --- a/src/TesApi.Web/Events/RunnerEventsProcessor.cs +++ b/src/TesApi.Web/Events/RunnerEventsProcessor.cs @@ -236,7 +236,7 @@ public async Task GetMessageBatchStateAsync(RunnerEventsMes var nodeMessage = message.RunnerEventMessage; logger.LogDebug("Getting batch task state from event {EventName} for {TesTask}.", nodeMessage.Name ?? message.Event, nodeMessage.EntityId); - var state = (nodeMessage.Name ?? message.Event) switch + return (nodeMessage.Name ?? 
message.Event) switch { Tes.Runner.Events.EventsPublisher.DownloadStartEvent => new AzureBatchTaskState(AzureBatchTaskState.TaskState.Initializing, BatchTaskStartTime: nodeMessage.Created), @@ -247,7 +247,7 @@ public async Task GetMessageBatchStateAsync(RunnerEventsMes Tes.Runner.Events.EventsPublisher.FailedStatus => new( AzureBatchTaskState.TaskState.NodeFilesUploadOrDownloadFailed, - Failure: new("SystemError", + Failure: new(AzureBatchTaskState.SystemError, Enumerable.Empty() .Append("Download failed.") .Append(nodeMessage.EventData["errorMessage"]) @@ -268,7 +268,7 @@ public async Task GetMessageBatchStateAsync(RunnerEventsMes Tes.Runner.Events.EventsPublisher.FailedStatus => new( AzureBatchTaskState.TaskState.InfoUpdate, - Failure: new("ExecutorError", + Failure: new(AzureBatchTaskState.ExecutorError, Enumerable.Empty() .Append(nodeMessage.EventData["errorMessage"]) .Concat(await AddProcessLogsIfAvailable(nodeMessage, tesTask, cancellationToken))), @@ -288,7 +288,7 @@ public async Task GetMessageBatchStateAsync(RunnerEventsMes Tes.Runner.Events.EventsPublisher.FailedStatus => new( AzureBatchTaskState.TaskState.NodeFilesUploadOrDownloadFailed, - Failure: new("SystemError", + Failure: new(AzureBatchTaskState.SystemError, Enumerable.Empty() .Append("Upload failed.") .Append(nodeMessage.EventData["errorMessage"]) @@ -301,15 +301,17 @@ public async Task GetMessageBatchStateAsync(RunnerEventsMes { Tes.Runner.Events.EventsPublisher.SuccessStatus => new( AzureBatchTaskState.TaskState.CompletedSuccessfully, + ReplaceBatchTaskStartTime: true, BatchTaskStartTime: nodeMessage.Created - TimeSpan.Parse(nodeMessage.EventData["duration"]), BatchTaskEndTime: nodeMessage.Created), Tes.Runner.Events.EventsPublisher.FailedStatus => new( AzureBatchTaskState.TaskState.CompletedWithErrors, - Failure: new("SystemError", + Failure: new(AzureBatchTaskState.SystemError, Enumerable.Empty() .Append("Node script failed.") .Append(nodeMessage.EventData["errorMessage"])), + 
ReplaceBatchTaskStartTime: true, BatchTaskStartTime: nodeMessage.Created - TimeSpan.Parse(nodeMessage.EventData["duration"]), BatchTaskEndTime: nodeMessage.Created), @@ -319,17 +321,15 @@ public async Task GetMessageBatchStateAsync(RunnerEventsMes _ => throw new System.Diagnostics.UnreachableException(), }; - return state; - // Helpers static IEnumerable GetOutputFileLogs(IDictionary eventData) { - if (eventData is null || !eventData.ContainsKey("fileLog-Count")) + if (eventData is null || !eventData.TryGetValue("fileLog-Count", out var fileCount)) { yield break; } - var numberOfFiles = int.Parse(eventData["fileLog-Count"], System.Globalization.CultureInfo.InvariantCulture); + var numberOfFiles = int.Parse(fileCount, System.Globalization.CultureInfo.InvariantCulture); for (var i = 0; i < numberOfFiles; ++i) { @@ -369,7 +369,7 @@ async IAsyncEnumerable GetProcessLogs(Tes.Runner.Events.EventMessage mes await foreach (var uri in azureProxy.ListBlobsAsync(new(await storageAccessProvider.GetInternalTesTaskBlobUrlAsync(tesTask, string.Empty, Azure.Storage.Sas.BlobSasPermissions.List, cancellationToken)), cancellationToken) .Where(blob => blob.BlobName.EndsWith(".txt") && blob.BlobName.Split('/').Last().StartsWith(blobNameStartsWith)) - .OrderBy(blob => blob.BlobName) // Not perfect ordering, but reasonable. This is more likely to be read by people rather then machines. Perfect would involve regex. + .OrderBy(blob => blob.BlobName) // Not perfect ordering, but reasonable. The final results are more likely to be interpreted by people rather then machines. Perfect would involve regex. 
.Select(blob => blob.BlobUri) .WithCancellation(cancellationToken)) { diff --git a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs index 6d7135118..0389130d8 100644 --- a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs +++ b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs @@ -175,7 +175,7 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func< { tesTask.State = TesState.SYSTEMERROREnum; tesTask.EndTime = DateTimeOffset.UtcNow; - tesTask.SetFailureReason("UnknownError", exc.Message, exc.StackTrace); + tesTask.SetFailureReason(AzureBatchTaskState.UnknownError, exc.Message, exc.StackTrace); } if (exc is Microsoft.Azure.Batch.Common.BatchException batchException) @@ -261,7 +261,7 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func< // else // { // logger.LogError(exc, $"Updating TES Task '{tesTask.Id}' threw {exc.GetType().FullName}: '{exc.Message}'. 
Stack trace: {exc.StackTrace}"); - // currentTesTask?.SetWarning("UnknownError", exc.Message, exc.StackTrace); + // currentTesTask?.SetWarning(AzureBatchTaskState.UnknownError, exc.Message, exc.StackTrace); // } // if (currentTesTask is not null) diff --git a/src/TesApi.Web/PoolScheduler.cs b/src/TesApi.Web/PoolScheduler.cs index aac248f46..e57c77b5d 100644 --- a/src/TesApi.Web/PoolScheduler.cs +++ b/src/TesApi.Web/PoolScheduler.cs @@ -285,8 +285,9 @@ private async IAsyncEnumerable GetCloudTaskStatesAsync( static AzureBatchTaskState.FailureInformation ParseComputeNodeErrors(IReadOnlyList nodeErrors) { - var totalList = nodeErrors.Select(nodeError => Enumerable.Empty().Append(nodeError.Code).Append(nodeError.Message) - .Concat(nodeError.ErrorDetails.Select(errorDetail => Enumerable.Empty().Append(errorDetail.Name).Append(errorDetail.Value)).SelectMany(s => s))) + var totalList = nodeErrors.Select(nodeError => Enumerable.Empty() + .Append(nodeError.Code).Append(nodeError.Message) + .Concat(nodeError.ErrorDetails.Select(FormatNameValuePair))) .SelectMany(s => s).ToList(); if (totalList.Contains(TaskFailureInformationCodes.DiskFull)) @@ -321,13 +322,13 @@ async IAsyncEnumerable GetFailures([System.Runtime.Compiler AzureBatchTaskState ConvertFromResize(ResizeError failure) => new(AzureBatchTaskState.TaskState.NodeAllocationFailed, Failure: new(failure.Code, Enumerable.Empty() .Append(failure.Message) - .Concat(failure.Values.Select(t => t.Value)))); + .Concat(failure.Values.Select(FormatNameValuePair)))); AzureBatchTaskState ConvertFromStartTask(TaskFailureInformation failure) => new(AzureBatchTaskState.TaskState.NodeStartTaskFailed, Failure: new(failure.Code, Enumerable.Empty() .Append(failure.Message) .Append($"Start task failed ({failure.Category})") - .Concat(failure.Details.Select(t => t.Value)))); + .Concat(failure.Details.Select(FormatNameValuePair)))); ResizeError PopNextResizeError() => pool.ResizeErrors.TryDequeue(out var resizeError) ? 
resizeError : default; @@ -352,7 +353,7 @@ AzureBatchTaskState GetCompletedBatchState(CloudTask task) Enumerable.Empty() .Append(task.ExecutionInformation.FailureInformation.Message) .Append($"Batch task ExitCode: {task.ExecutionInformation?.ExitCode}, Failure message: {task.ExecutionInformation?.FailureInformation?.Message}") - .Concat(task.ExecutionInformation.FailureInformation.Details.Select(pair => pair.Value))), + .Concat(task.ExecutionInformation.FailureInformation.Details.Select(FormatNameValuePair))), BatchTaskStartTime: task.ExecutionInformation.StartTime, BatchTaskEndTime: task.ExecutionInformation.EndTime, BatchTaskExitCode: task.ExecutionInformation.ExitCode), @@ -360,6 +361,9 @@ AzureBatchTaskState GetCompletedBatchState(CloudTask task) _ => throw new System.Diagnostics.UnreachableException(), }; } + + static string FormatNameValuePair(NameValuePair pair) + => $"{pair.Name}: {pair.Value}"; } } } From 49a4508174fe61c060cd24731a3ae591dd8eb787 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 16 Nov 2023 18:37:19 -0800 Subject: [PATCH 093/202] some more cleanup --- src/TesApi.Web/BatchScheduler.cs | 8 +- .../Events/RunnerEventsProcessor.cs | 94 +++++++---- .../OrchestrateOnBatchSchedulerServiceBase.cs | 70 +++++---- src/TesApi.Web/PoolScheduler.cs | 146 +++++++++--------- src/TesApi.Web/TaskScheduler.cs | 123 ++++++++------- 5 files changed, 247 insertions(+), 194 deletions(-) diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 1fbf787ec..274518855 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -1399,8 +1399,8 @@ static bool TryGetValueAsDouble(Dictionary dict, string key, out VmCpuModelName = metrics.GetValueOrDefault("VmCpuModelName") }; - taskStartTime = TryGetValueAsDateTimeOffset(metrics, "BlobXferPullStart", out var startTime) ? (DateTimeOffset?)startTime : null; - taskEndTime = TryGetValueAsDateTimeOffset(metrics, "UploadEnd", out var endTime) ? 
(DateTimeOffset?)endTime : null; + taskStartTime = TryGetValueAsDateTimeOffset(metrics, "BlobXferPullStart", out var startTime) ? startTime : null; + taskEndTime = TryGetValueAsDateTimeOffset(metrics, "UploadEnd", out var endTime) ? endTime : null; } catch (Exception ex) { @@ -1417,8 +1417,8 @@ static bool TryGetValueAsDouble(Dictionary dict, string key, out } private static Dictionary DelimitedTextToDictionary(string text, string fieldDelimiter = "=", string rowDelimiter = "\n") - => text.Split(rowDelimiter) - .Select(line => { var parts = line.Split(fieldDelimiter); return new KeyValuePair(parts[0], parts[1]); }) + => text.Split(rowDelimiter, StringSplitOptions.RemoveEmptyEntries) + .Select(line => { var parts = line.Split(fieldDelimiter, 2); return new KeyValuePair(parts[0].Trim(), parts.Length < 2 ? string.Empty : parts[1]); }) .ToDictionary(kv => kv.Key, kv => kv.Value); diff --git a/src/TesApi.Web/Events/RunnerEventsProcessor.cs b/src/TesApi.Web/Events/RunnerEventsProcessor.cs index d82421158..5aceb7715 100644 --- a/src/TesApi.Web/Events/RunnerEventsProcessor.cs +++ b/src/TesApi.Web/Events/RunnerEventsProcessor.cs @@ -91,7 +91,7 @@ public void ValidateMessageMetadata(RunnerEventsMessage message) /// /// Tes runner event message metadata. /// A for controlling the lifetime of the asynchronous operation. - /// Validation exceptions. + /// Validation exceptions. /// A containing the associated . /// This method assumes was successfully validated by . public async Task DownloadAndValidateMessageContentAsync(RunnerEventsMessage message, CancellationToken cancellationToken) @@ -102,84 +102,105 @@ public async Task DownloadAndValidateMessageContentAsync(Ru { var messageText = await azureProxy.DownloadBlobAsync(message.BlobUri, cancellationToken); content = System.Text.Json.JsonSerializer.Deserialize(messageText) - ?? throw new InvalidOperationException("Deserialize() returned null."); + ?? 
throw new DownloadOrParseException("Deserialize() returned null."); } catch (Exception ex) { - throw new InvalidOperationException($"Event message blob is malformed. {ex.GetType().FullName}:{ex.Message}", ex); + throw new DownloadOrParseException($"Event message blob is malformed. {ex.GetType().FullName}:{ex.Message}", ex); } message = new(message, content); // Validate content - Assert(Guid.TryParse(content.Id, out _), + Validate(Guid.TryParse(content.Id, out _), $"{nameof(content.Id)}('{content.Id}') is malformed."); - Assert(Tes.Runner.Events.EventsPublisher.EventVersion.Equals(content.EventVersion, StringComparison.Ordinal), + Validate(Tes.Runner.Events.EventsPublisher.EventVersion.Equals(content.EventVersion, StringComparison.Ordinal), $"{nameof(content.EventVersion)}('{content.EventVersion}') is not recognized."); - Assert(Tes.Runner.Events.EventsPublisher.EventDataVersion.Equals(content.EventDataVersion, StringComparison.Ordinal), + Validate(Tes.Runner.Events.EventsPublisher.EventDataVersion.Equals(content.EventDataVersion, StringComparison.Ordinal), $"{nameof(content.EventDataVersion)}('{content.EventDataVersion}') is not recognized."); - Assert(Tes.Runner.Events.EventsPublisher.TesTaskRunnerEntityType.Equals(content.EntityType, StringComparison.Ordinal), + Validate(Tes.Runner.Events.EventsPublisher.TesTaskRunnerEntityType.Equals(content.EntityType, StringComparison.Ordinal), $"{nameof(content.EntityType)}('{content.EntityType}') is not recognized."); - Assert(message.TesTaskId.Equals(content.EntityId, StringComparison.Ordinal), + Validate(message.TesTaskId.Equals(content.EntityId, StringComparison.Ordinal), $"{nameof(content.EntityId)}('{content.EntityId}') does not match the expected value of '{message.TesTaskId}'."); - Assert(message.Tags["task-id"].Equals(content.EntityId, StringComparison.Ordinal), + Validate(message.Tags["task-id"].Equals(content.EntityId, StringComparison.Ordinal), $"{nameof(content.EntityId)}('{content.EntityId}') does not match the 
expected value of '{message.Tags["task-id"]}' from the tags.."); - Assert(message.Event.Equals(content.Name, StringComparison.OrdinalIgnoreCase), + Validate(message.Event.Equals(content.Name, StringComparison.OrdinalIgnoreCase), $"{nameof(content.Name)}('{content.Name}') does not match the expected value of '{message.Event}' from the blob path."); - Assert(message.Tags["event-name"].Equals(content.Name, StringComparison.Ordinal), + Validate(message.Tags["event-name"].Equals(content.Name, StringComparison.Ordinal), $"{nameof(content.Name)}('{content.Name}') does not match the expected value of '{message.Tags["event-name"]}' from the tags."); - // Event type specific validations + // Event type specific content validations switch (content.Name) { case Tes.Runner.Events.EventsPublisher.DownloadStartEvent: - Assert(Tes.Runner.Events.EventsPublisher.StartedStatus.Equals(content.StatusMessage, StringComparison.Ordinal), + Validate(Tes.Runner.Events.EventsPublisher.StartedStatus.Equals(content.StatusMessage, StringComparison.Ordinal), $"{nameof(content.StatusMessage)}('{content.StatusMessage}') does not match the expected value of '{Tes.Runner.Events.EventsPublisher.StartedStatus}'."); + ValidateCreated(); break; case Tes.Runner.Events.EventsPublisher.DownloadEndEvent: - Assert(new[] { Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(content.StatusMessage), + Validate(new[] { Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(content.StatusMessage), $"{nameof(content.StatusMessage)}('{content.StatusMessage}') does not match one of the expected valued of '{Tes.Runner.Events.EventsPublisher.SuccessStatus}' or '{Tes.Runner.Events.EventsPublisher.FailedStatus}'."); + ValidateFailedStatus(); break; case Tes.Runner.Events.EventsPublisher.UploadStartEvent: - Assert(Tes.Runner.Events.EventsPublisher.StartedStatus.Equals(content.StatusMessage, 
StringComparison.Ordinal), + Validate(Tes.Runner.Events.EventsPublisher.StartedStatus.Equals(content.StatusMessage, StringComparison.Ordinal), $"{nameof(content.StatusMessage)}('{content.StatusMessage}') does not match the expected value of '{Tes.Runner.Events.EventsPublisher.StartedStatus}'."); break; case Tes.Runner.Events.EventsPublisher.UploadEndEvent: - Assert(new[] { Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(content.StatusMessage), + Validate(new[] { Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(content.StatusMessage), $"{nameof(content.StatusMessage)}('{content.StatusMessage}') does not match one of the expected valued of '{Tes.Runner.Events.EventsPublisher.SuccessStatus}' or '{Tes.Runner.Events.EventsPublisher.FailedStatus}'."); + ValidateFailedStatus(); break; case Tes.Runner.Events.EventsPublisher.ExecutorStartEvent: - Assert(Tes.Runner.Events.EventsPublisher.StartedStatus.Equals(content.StatusMessage, StringComparison.Ordinal), + Validate(Tes.Runner.Events.EventsPublisher.StartedStatus.Equals(content.StatusMessage, StringComparison.Ordinal), $"{nameof(content.StatusMessage)}('{content.StatusMessage}') does not match the expected value of '{Tes.Runner.Events.EventsPublisher.StartedStatus}'."); + ValidateCreated(); break; case Tes.Runner.Events.EventsPublisher.ExecutorEndEvent: - Assert(new[] { Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(content.StatusMessage), + Validate(new[] { Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(content.StatusMessage), $"{nameof(content.StatusMessage)}('{content.StatusMessage}') does not match one of the expected valued of '{Tes.Runner.Events.EventsPublisher.SuccessStatus}' or '{Tes.Runner.Events.EventsPublisher.FailedStatus}'."); + ValidateFailedStatus(); + ValidateCreated(); + 
Validate(content.EventData.ContainsKey("exitCode"), $"{nameof(content.Name)}('{content.Name}') does not contain 'exitCode'"); break; case Tes.Runner.Events.EventsPublisher.TaskCompletionEvent: - Assert(new[] { Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(content.StatusMessage), + Validate(new[] { Tes.Runner.Events.EventsPublisher.SuccessStatus, Tes.Runner.Events.EventsPublisher.FailedStatus }.Contains(content.StatusMessage), $"{nameof(content.StatusMessage)}('{content.StatusMessage}') does not match one of the expected valued of '{Tes.Runner.Events.EventsPublisher.SuccessStatus}' or '{Tes.Runner.Events.EventsPublisher.FailedStatus}'."); + ValidateFailedStatus(); + ValidateCreated(); + Validate(content.EventData.ContainsKey("duration"), $"{nameof(content.Name)}('{content.Name}') does not contain 'duration'"); break; default: - Assert(false, $"{nameof(content.Name)}('{content.Name}') is not recognized."); + Validate(false, $"{nameof(content.Name)}('{content.Name}') is not recognized."); break; } return message; - static void Assert([System.Diagnostics.CodeAnalysis.DoesNotReturnIf(false)] bool condition, string message) + void ValidateFailedStatus() + { + if (Tes.Runner.Events.EventsPublisher.FailedStatus.Equals(content.StatusMessage)) + { + Validate(content.EventData.ContainsKey("errorMessage"), $"{nameof(content.Name)}('{content.Name}' with {nameof(Tes.Runner.Events.EventsPublisher.FailedStatus)}) does not contain 'errorMessage'"); + } + } + + void ValidateCreated() + => Validate(content.Created != default, $"{nameof(content.Name)}('{content.Name}') {nameof(content.Created)} was not set."); + + static void Validate([System.Diagnostics.CodeAnalysis.DoesNotReturnIf(false)] bool condition, string message) { if (!condition) { - throw new AssertException(message); + throw new DownloadOrParseException(message); } } } @@ -379,7 +400,7 @@ async IAsyncEnumerable GetProcessLogs(Tes.Runner.Events.EventMessage mes } 
/// - /// Marks this event message processed. + /// Marks the event message as processed. /// /// Tes runner event message metadata. /// A for controlling the lifetime of the asynchronous operation. @@ -396,14 +417,31 @@ await azureProxy.SetBlobTags( } /// - /// Validation assert failed. + /// Prevents the message from being reprocessed by removing the 'task-id' tag. Used for malformed blobs. /// - public class AssertException : InvalidOperationException + /// Tes runner event message metadata. + /// A for controlling the lifetime of the asynchronous operation. + /// + /// This method assumes 's and are intact and correct. + public async Task RemoveMessageFromReattemptsAsync(RunnerEventsMessage message, CancellationToken cancellationToken) + { + message.Tags.Remove("task-id"); + await azureProxy.SetBlobTags( + message.BlobUri, + message.Tags, + cancellationToken); + } + + /// + /// The exception that is thrown when an event message is malformed. + /// + public class DownloadOrParseException : InvalidOperationException { /// - public AssertException(string message) : base(message) - { - } + public DownloadOrParseException(string message) : base(message) { } + + /// + public DownloadOrParseException(string message, Exception exception) : base(message, exception) { } } } } diff --git a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs index 0389130d8..648f9f5aa 100644 --- a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs +++ b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs @@ -15,9 +15,7 @@ namespace TesApi.Web { /// - /// A background service template that schedules TES tasks in the batch system, orchestrates their lifecycle, and updates their state. - /// This should only be used to build system-wide singleton services. This class does not support scale-out on multiple machines, - /// nor does it implement a leasing mechanism. In the future, consider using the Lease Blob operation. 
+ /// Common functionality for s that perform operations in the batch system, including common functionality for services using . /// internal abstract class OrchestrateOnBatchSchedulerServiceBase : BackgroundService { @@ -30,9 +28,9 @@ internal abstract class OrchestrateOnBatchSchedulerServiceBase : BackgroundServi /// Default constructor /// /// Used for requesting termination of the current application. Pass null to allow this service to stop during initialization without taking down the application. - /// The main TES task database repository implementation - /// The batch scheduler implementation - /// The logger instance + /// The main TES task database repository implementation. + /// The batch scheduler implementation. + /// The logger instance. protected OrchestrateOnBatchSchedulerServiceBase(IHostApplicationLifetime hostApplicationLifetime, IRepository repository, IBatchScheduler batchScheduler, ILogger logger) { this.hostApplicationLifetime = hostApplicationLifetime; @@ -44,7 +42,7 @@ protected OrchestrateOnBatchSchedulerServiceBase(IHostApplicationLifetime hostAp /// /// Prepends the log message with the ultimately derived class's name. /// - /// + /// Log message tail. /// prepended with the class name. 
protected string MarkLogMessage(string message) { @@ -53,16 +51,18 @@ protected string MarkLogMessage(string message) /// [System.Diagnostics.CodeAnalysis.SuppressMessage("Usage", "CA2254:Template should be a static expression", Justification = "Used to provide service's name in log message.")] - public override Task StopAsync(CancellationToken cancellationToken) + public override async Task StopAsync(CancellationToken cancellationToken) { logger.LogInformation(MarkLogMessage("stopping...")); - return base.StopAsync(cancellationToken); + await base.StopAsync(cancellationToken); } /// [System.Diagnostics.CodeAnalysis.SuppressMessage("Usage", "CA2254:Template should be a static expression", Justification = "Used to provide service's name in log message.")] protected sealed override async Task ExecuteAsync(CancellationToken stoppingToken) { + logger.LogInformation(MarkLogMessage("starting...")); + try { // The order of these two calls is critical. @@ -86,33 +86,34 @@ protected sealed override async Task ExecuteAsync(CancellationToken stoppingToke /// This method is called when the starts. The implementation should return a task that represents /// the lifetime of the long running operation(s) being performed. /// - /// Triggered when is called. - /// A that represents the long running operations. + /// Triggered when is called. + /// A that represents the long running operations. /// See Worker Services in .NET for implementation guidelines. - protected abstract Task ExecuteCoreAsync(CancellationToken stoppingToken); + protected abstract ValueTask ExecuteCoreAsync(CancellationToken cancellationToken); /// /// This method is called right before . It can be used to prepare the service or the system before the service's operations begin. /// - /// Triggered when is called. + /// Triggered when is called. /// A that represents this method's operations. 
- protected virtual Task ExecuteSetupAsync(CancellationToken stoppingToken) => Task.CompletedTask; + /// This method runs right after . + protected virtual ValueTask ExecuteSetupAsync(CancellationToken cancellationToken) => ValueTask.CompletedTask; /// /// This method is called right before . It can be used to prepare the service or the system before the service's operations begin. /// - /// Triggered when is called. + /// Triggered when is called. /// This method's lifetime will delay the exit of in the base class, thus delaying the start of subsequent services in the system. - protected virtual void ExecuteSetup(CancellationToken stoppingToken) { } + protected virtual void ExecuteSetup(CancellationToken cancellationToken) { } /// /// Runs repeatedly at an interval of . /// /// Interval to rerun . /// Action to repeatedly run. - /// Triggered when is called. - /// A System.Threading.Tasks.Task that represents the long running operations. - protected async Task ExecuteActionOnIntervalAsync(TimeSpan runInterval, Func action, CancellationToken stoppingToken) + /// Triggered when is called. + /// A that represents this method's operations. + protected async ValueTask ExecuteActionOnIntervalAsync(TimeSpan runInterval, Func action, CancellationToken cancellationToken) { ArgumentNullException.ThrowIfNull(action); @@ -124,9 +125,9 @@ protected async Task ExecuteActionOnIntervalAsync(TimeSpan runInterval, Func - /// Retrieves provided actionable TES tasks from the database using , performs an action in the batch system using , and updates the resultant state + /// Retrieves provided actionable TES tasks from the database using , performs an action in the batch system using , and updates the resultant state in the repository. /// - /// A System.Threading.Tasks.ValueTask that represents the long running operations. 
- protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func>> tesTaskGetter, Func>> tesTaskProcessor, CancellationToken stoppingToken, string unitsLabel = "tasks") + /// Tag to disambiguate the state and/or action workflow performed in log messages. + /// Provides array of s on which to perform actions through . + /// Method operating on returning indicating if each needs updating into the repository. + /// A for controlling the lifetime of the asynchronous operation. + /// Tag to indicate the underlying unit quantity of items processed in log messages. + /// A that represents this method's operations. + protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func>> tesTaskGetter, Func>> tesTaskProcessor, CancellationToken cancellationToken, string unitsLabel = "tasks") { - var tesTasks = await (await tesTaskGetter(stoppingToken)).ToArrayAsync(stoppingToken); + var tesTasks = await (await tesTaskGetter(cancellationToken)).ToArrayAsync(cancellationToken); if (tesTasks.All(task => task is null)) { @@ -157,7 +163,7 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func< var startTime = DateTime.UtcNow; - await foreach (var tesTaskTask in tesTaskProcessor(tesTasks, stoppingToken).WithCancellation(stoppingToken)) + await foreach (var tesTaskTask in tesTaskProcessor(tesTasks, cancellationToken).WithCancellation(cancellationToken)) { var tesTask = tesTaskTask.Related; @@ -204,7 +210,7 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func< } logger.LogError(exc, "TES task: {TesTask} threw an exception in OrchestrateTesTasksOnBatch({Poll}).", tesTask.Id, pollName); - await repository.UpdateItemAsync(tesTask, stoppingToken); + await repository.UpdateItemAsync(tesTask, cancellationToken); } if (isModified) @@ -239,7 +245,7 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func< logger.LogDebug("{TesTask} failed, state: {TesTaskState}, reason: 
{TesTaskFailureReason}", tesTask.Id, tesTask.State, tesTask.FailureReason); } - await repository.UpdateItemAsync(tesTask, stoppingToken); + await repository.UpdateItemAsync(tesTask, cancellationToken); } } catch (RepositoryCollisionException exc) @@ -277,8 +283,8 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func< if (batchScheduler.NeedPoolFlush) { - var pools = (await repository.GetItemsAsync(task => task.State == TesState.INITIALIZINGEnum || task.State == TesState.RUNNINGEnum, stoppingToken)).Select(task => task.PoolId).Distinct(); - await batchScheduler.FlushPoolsAsync(pools, stoppingToken); + var pools = (await repository.GetItemsAsync(task => task.State == TesState.INITIALIZINGEnum || task.State == TesState.RUNNINGEnum, cancellationToken)).Select(task => task.PoolId).Distinct(); + await batchScheduler.FlushPoolsAsync(pools, cancellationToken); } logger.LogDebug("OrchestrateTesTasksOnBatch({Poll}) for {TaskCount} {UnitsLabel} completed in {TotalSeconds} seconds.", pollName, tesTasks.Where(task => task is not null).Count(), unitsLabel, DateTime.UtcNow.Subtract(startTime).TotalSeconds); diff --git a/src/TesApi.Web/PoolScheduler.cs b/src/TesApi.Web/PoolScheduler.cs index e57c77b5d..905f01cac 100644 --- a/src/TesApi.Web/PoolScheduler.cs +++ b/src/TesApi.Web/PoolScheduler.cs @@ -17,7 +17,7 @@ namespace TesApi.Web { /// - /// A background service that montitors CloudPools in the batch system, orchestrates their lifecycle, and updates their state. + /// A background service that montitors s in the batch system, orchestrates their lifecycle, and updates their state. /// This should only be used as a system-wide singleton service. This class does not support scale-out on multiple machines, /// nor does it implement a leasing mechanism. In the future, consider using the Lease Blob operation. 
/// @@ -31,37 +31,63 @@ internal class PoolScheduler : OrchestrateOnBatchSchedulerServiceBase private static readonly TimeSpan StateTransitionTimeForDeletionTimeSpan = 0.75 * BatchScheduler.BatchDeleteNewTaskWorkaroundTimeSpan; private static readonly TimeSpan CompletedTaskListTimeSpan = 0.5 * BatchScheduler.BatchDeleteNewTaskWorkaroundTimeSpan; + /// + /// Predicate to obtain s (recently) running on s. Used to connect tasks and nodes together. + /// + /// Shared between , , and to limit Batch API requests to a minimum. + internal static bool TaskListWithComputeNodeInfoPredicate(CloudTask task) => !TaskState.Completed.Equals(task.State) && !string.IsNullOrEmpty(task.ComputeNodeInformation?.ComputeNodeId); + + /// + /// Predicate to obtain s pending in Azure Batch. + /// + /// Shared between and . + private static bool ActiveTaskListPredicate(CloudTask task) => TaskState.Active.Equals(task.State); + + /// + /// Predicate used to obtain s to backstop completing s in case of problems with the . + /// + /// Shared between and . + private static bool CompletedTaskListPredicate(CloudTask task, DateTime now) => TaskState.Completed.Equals(task.State) && task.StateTransitionTime < now - CompletedTaskListTimeSpan; + /// /// Default constructor /// /// Used for requesting termination of the current application during initialization. - /// The main TES task database repository implementation - /// - /// + /// The main TES task database repository implementation. + /// The batch scheduler implementation. + /// The logger instance. 
/// public PoolScheduler(Microsoft.Extensions.Hosting.IHostApplicationLifetime hostApplicationLifetime, IRepository repository, IBatchScheduler batchScheduler, ILogger logger) : base(hostApplicationLifetime, repository, batchScheduler, logger) { } /// - protected override void ExecuteSetup(CancellationToken stoppingToken) + protected override void ExecuteSetup(CancellationToken cancellationToken) { - batchScheduler.LoadExistingPoolsAsync(stoppingToken).Wait(stoppingToken); // Delay starting Scheduler until this completes to finish initializing BatchScheduler. + batchScheduler.LoadExistingPoolsAsync(cancellationToken).Wait(cancellationToken); // Delay starting TaskScheduler until this completes to finish initializing the shared parts of BatchScheduler. } /// - protected override Task ExecuteCoreAsync(CancellationToken stoppingToken) + protected override ValueTask ExecuteCoreAsync(CancellationToken cancellationToken) { - return ServiceBatchPoolsAsync(stoppingToken); + return ExecuteActionOnIntervalAsync( + RunInterval, + async token => await ExecuteActionOnPoolsAsync( + async (pool, token) => + { + await pool.ServicePoolAsync(token); + await ProcessTasksAsync(pool, DateTime.UtcNow, pool.ListCloudTasksAsync(), token); + }, + token), + cancellationToken); } /// /// Performs an action on each batch pool. /// - /// - /// - /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. + /// Method performing operations on a . + /// A for controlling the lifetime of the asynchronous operation. 
/// - private async ValueTask ExecuteActionOnPoolsAsync(string pollName, Func action, CancellationToken stoppingToken) + private async ValueTask ExecuteActionOnPoolsAsync(Func action, CancellationToken cancellationToken) { ArgumentNullException.ThrowIfNull(action); @@ -74,7 +100,7 @@ private async ValueTask ExecuteActionOnPoolsAsync(string pollName, Func + await Parallel.ForEachAsync(pools, cancellationToken, async (pool, token) => { try { @@ -82,41 +108,20 @@ await Parallel.ForEachAsync(pools, stoppingToken, async (pool, token) => } catch (Exception exc) { - logger.LogError(exc, @"Batch pool {PoolId} threw an exception in {Poll}.", pool.Id, pollName); + logger.LogError(exc, @"Batch pool {PoolId} threw an exception when serviced.", pool.Id); } }); - logger.LogDebug(@"{Poll} for {PoolsCount} pools completed in {TotalSeconds} seconds.", pollName, pools.Count, DateTime.UtcNow.Subtract(startTime).TotalSeconds); - } - - /// - /// Repeatedly services all batch pools associated with this TES instance, including updating tasks. - /// - /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. - /// - private Task ServiceBatchPoolsAsync(CancellationToken stoppingToken) - { - return ExecuteActionOnIntervalAsync( - RunInterval, - async token => - await ExecuteActionOnPoolsAsync( - "Service Batch Pools", - async (pool, cancellationToken) => - { - await pool.ServicePoolAsync(cancellationToken); - await ProcessTasksAsync(pool, DateTime.UtcNow, pool.ListCloudTasksAsync(), cancellationToken); - }, - token), - stoppingToken); + logger.LogDebug(@"Service Batch Pools for {PoolsCount} pools completed in {TotalSeconds} seconds.", pools.Count, DateTime.UtcNow.Subtract(startTime).TotalSeconds); } /// - /// Processes tasks to manage state. + /// Processes tasks connected to a pool to manage state. /// - /// + /// The associated with . /// Reference time. - /// - /// + /// s requiring attention. 
+ /// A for controlling the lifetime of the asynchronous operation. /// private async ValueTask ProcessTasksAsync(IBatchPool pool, DateTime now, IAsyncEnumerable tasks, CancellationToken cancellationToken) { @@ -145,27 +150,12 @@ private async ValueTask ProcessTasksAsync(IBatchPool pool, DateTime now, IAsyncE await ProcessDeletedTasks(deletionCandidateTasks, cancellationToken); } - /// - /// Shared between and . - /// - internal static bool TaskListWithComputeNodeInfoPredicate(CloudTask task) => !TaskState.Completed.Equals(task.State) && !string.IsNullOrEmpty(task.ComputeNodeInformation?.ComputeNodeId); - - /// - /// Shared between and . - /// - private static bool ActiveTaskListPredicate(CloudTask task) => TaskState.Active.Equals(task.State); - - /// - /// Shared between and . - /// - private static bool CompletedTaskListPredicate(CloudTask task, DateTime now) => TaskState.Completed.Equals(task.State) && task.StateTransitionTime < now - CompletedTaskListTimeSpan; - /// /// Updates each task based on the provided states. /// - /// The batch pool/job from which the state was obtained. - /// The states with which to update the associated tes tasks. - /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. + /// The batch pool from which the state was obtained. + /// The states with which to update the associated s. + /// A for controlling the lifetime of the asynchronous operation. /// private async ValueTask ProcessCloudTaskStatesAsync(string poolId, IAsyncEnumerable states, CancellationToken cancellationToken) { @@ -201,10 +191,10 @@ await OrchestrateTesTasksOnBatchAsync( } /// - /// Deletes cloud tasks. + /// Deletes completed s. /// /// Tasks to delete. - /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. + /// A for controlling the lifetime of the asynchronous operation. 
/// private async ValueTask ProcessDeletedTasks(IAsyncEnumerable tasks, CancellationToken cancellationToken) { @@ -231,12 +221,12 @@ private async ValueTask ProcessDeletedTasks(IAsyncEnumerable - /// Obtains for tasks in specific conditions. + /// Obtains for updating s. /// - /// - /// - /// - /// + /// The associated with . + /// Reference time. + /// s which need s for further processing. + /// A for controlling the lifetime of the asynchronous operation. /// private async IAsyncEnumerable GetCloudTaskStatesAsync(IBatchPool pool, DateTime now, IAsyncEnumerable tasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) { @@ -247,7 +237,7 @@ private async IAsyncEnumerable GetCloudTaskStatesAsync( { var taskList = await tasks.ToListAsync(cancellationToken); taskListWithComputeNodeInfo = taskList.Where(TaskListWithComputeNodeInfoPredicate).ToList(); - activeTaskList = taskList.Where(ActiveTaskListPredicate).OrderByDescending(task => task.StateTransitionTime).ToList(); + activeTaskList = taskList.Where(ActiveTaskListPredicate).OrderByDescending(task => task.StateTransitionTime?.ToUniversalTime()).ToList(); completedTaskList = taskList.Where(task => CompletedTaskListPredicate(task, now)).ToList(); } @@ -269,8 +259,8 @@ private async IAsyncEnumerable GetCloudTaskStatesAsync( } } - await foreach (var state in activeTaskList.ToAsyncEnumerable().Zip(GetFailures(cancellationToken), - (cloud, state) => new CloudTaskBatchTaskState(cloud.Id, state)) + await foreach (var state in activeTaskList.ToAsyncEnumerable() + .Zip(GetFailures(cancellationToken), (cloud, state) => new CloudTaskBatchTaskState(cloud.Id, state)) .WithCancellation(cancellationToken)) { yield return state; @@ -304,13 +294,13 @@ static AzureBatchTaskState.FailureInformation ParseComputeNodeErrors(IReadOnlyLi async IAsyncEnumerable GetFailures([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) #pragma warning restore CS1998 // 
Async method lacks 'await' operators and will run synchronously { - for (var failure = PopNextStartTaskFailure(); failure is not null; failure = PopNextStartTaskFailure()) + foreach (var failure in RepeatUntil(PopNextStartTaskFailure, failure => failure is null)) { yield return ConvertFromStartTask(failure); cancellationToken.ThrowIfCancellationRequested(); } - for (var failure = PopNextResizeError(); failure is not null; failure = PopNextResizeError()) + foreach (var failure in RepeatUntil(PopNextResizeError, failure => failure is null)) { yield return ConvertFromResize(failure); cancellationToken.ThrowIfCancellationRequested(); @@ -364,6 +354,22 @@ AzureBatchTaskState GetCompletedBatchState(CloudTask task) static string FormatNameValuePair(NameValuePair pair) => $"{pair.Name}: {pair.Value}"; + + static IEnumerable RepeatUntil(Func func, Predicate stop) + { + do + { + var t = func(); + + if (stop(t)) + { + yield break; + } + + yield return t; + } + while (true); + } } } } diff --git a/src/TesApi.Web/TaskScheduler.cs b/src/TesApi.Web/TaskScheduler.cs index 63c378420..d32d2e03c 100644 --- a/src/TesApi.Web/TaskScheduler.cs +++ b/src/TesApi.Web/TaskScheduler.cs @@ -15,7 +15,7 @@ namespace TesApi.Web { /// - /// A background service that schedules TES tasks in the batch system, orchestrates their lifecycle, and updates their state. + /// A background service that schedules s in the batch system, orchestrates their lifecycle, and updates their state. /// This should only be used as a system-wide singleton service. This class does not support scale-out on multiple machines, /// nor does it implement a leasing mechanism. In the future, consider using the Lease Blob operation. /// @@ -30,27 +30,22 @@ internal class TaskScheduler : OrchestrateOnBatchSchedulerServiceBase /// /// The task node event processor. /// Used for requesting termination of the current application during initialization. 
- /// The main TES task database repository implementation - /// The batch scheduler implementation - /// The logger instance + /// The main TES task database repository implementation. + /// The batch scheduler implementation. + /// The logger instance. public TaskScheduler(RunnerEventsProcessor nodeEventProcessor, Microsoft.Extensions.Hosting.IHostApplicationLifetime hostApplicationLifetime, IRepository repository, IBatchScheduler batchScheduler, ILogger logger) : base(hostApplicationLifetime, repository, batchScheduler, logger) { this.nodeEventProcessor = nodeEventProcessor; } - - /// - /// The main thread that continuously schedules TES tasks in the batch system - /// - /// Triggered when Microsoft.Extensions.Hosting.IHostedService.StopAsync(System.Threading.CancellationToken) is called. - /// A System.Threading.Tasks.Task that represents the long running operations. - protected override async Task ExecuteSetupAsync(CancellationToken stoppingToken) + /// + protected override async ValueTask ExecuteSetupAsync(CancellationToken cancellationToken) { try { - // Delay "starting" Scheduler until this completes to finish initializing BatchScheduler. - await batchScheduler.UploadTaskRunnerIfNeeded(stoppingToken); + // Delay "starting" TaskScheduler until this completes to finish initializing BatchScheduler. + await batchScheduler.UploadTaskRunnerIfNeeded(cancellationToken); } catch (Exception exc) { @@ -59,76 +54,72 @@ protected override async Task ExecuteSetupAsync(CancellationToken stoppingToken) } } - /// - /// The main thread that continuously schedules TES tasks in the batch system - /// - /// Triggered when Microsoft.Extensions.Hosting.IHostedService.StopAsync(System.Threading.CancellationToken) is called. - /// A System.Threading.Tasks.Task that represents the long running operations. 
- protected override Task ExecuteCoreAsync(CancellationToken stoppingToken) + /// + protected override async ValueTask ExecuteCoreAsync(CancellationToken cancellationToken) { - return Task.WhenAll( - ExecuteCancelledTesTasksOnBatchAsync(stoppingToken), - ExecuteQueuedTesTasksOnBatchAsync(stoppingToken), - ExecuteUpdateTesTaskFromEventBlobAsync(stoppingToken)); + await Task.WhenAll( + ExecuteCancelledTesTasksOnBatchAsync(cancellationToken), + ExecuteQueuedTesTasksOnBatchAsync(cancellationToken), + ExecuteUpdateTesTaskFromEventBlobAsync(cancellationToken)); } /// /// Retrieves all queued TES tasks from the database, performs an action in the batch system, and updates the resultant state /// /// - private Task ExecuteQueuedTesTasksOnBatchAsync(CancellationToken stoppingToken) + private async Task ExecuteQueuedTesTasksOnBatchAsync(CancellationToken cancellationToken) { var query = new Func>>( - async cancellationToken => (await repository.GetItemsAsync( + async token => (await repository.GetItemsAsync( predicate: t => t.State == TesState.QUEUEDEnum, - cancellationToken: cancellationToken)) + cancellationToken: token)) .OrderBy(t => t.CreationTime) .ToAsyncEnumerable()); - return ExecuteActionOnIntervalAsync(batchRunInterval, - cancellationToken => OrchestrateTesTasksOnBatchAsync("Queued", query, batchScheduler.ProcessQueuedTesTasksAsync, cancellationToken), - stoppingToken); + await ExecuteActionOnIntervalAsync(batchRunInterval, + token => OrchestrateTesTasksOnBatchAsync("Queued", query, batchScheduler.ProcessQueuedTesTasksAsync, token), + cancellationToken); } /// /// Retrieves all cancelled TES tasks from the database, performs an action in the batch system, and updates the resultant state /// - /// Triggered when Microsoft.Extensions.Hosting.IHostedService.StopAsync(System.Threading.CancellationToken) is called. + /// Triggered when Microsoft.Extensions.Hosting.IHostedService.StopAsync(System.Threading.CancellationToken) is called. 
/// - private Task ExecuteCancelledTesTasksOnBatchAsync(CancellationToken stoppingToken) + private async Task ExecuteCancelledTesTasksOnBatchAsync(CancellationToken cancellationToken) { var query = new Func>>( - async cancellationToken => (await repository.GetItemsAsync( + async token => (await repository.GetItemsAsync( predicate: t => t.State == TesState.CANCELINGEnum, - cancellationToken: cancellationToken)) + cancellationToken: token)) .OrderByDescending(t => t.CreationTime) .ToAsyncEnumerable()); - return ExecuteActionOnIntervalAsync(batchRunInterval, - cancellationToken => OrchestrateTesTasksOnBatchAsync( + await ExecuteActionOnIntervalAsync(batchRunInterval, + token => OrchestrateTesTasksOnBatchAsync( "Cancelled", query, - (tasks, cancellationToken) => batchScheduler.ProcessTesTaskBatchStatesAsync( + (tasks, ct) => batchScheduler.ProcessTesTaskBatchStatesAsync( tasks, Enumerable.Repeat(new(AzureBatchTaskState.TaskState.CancellationRequested), tasks.Length).ToArray(), - cancellationToken), - cancellationToken), - stoppingToken); + ct), + token), + cancellationToken); } /// /// Retrieves all event blobs from storage and updates the resultant state. /// - /// Triggered when Microsoft.Extensions.Hosting.IHostedService.StopAsync(System.Threading.CancellationToken) is called. + /// Triggered when Microsoft.Extensions.Hosting.IHostedService.StopAsync(System.Threading.CancellationToken) is called. 
/// - private Task ExecuteUpdateTesTaskFromEventBlobAsync(CancellationToken stoppingToken) + private async Task ExecuteUpdateTesTaskFromEventBlobAsync(CancellationToken cancellationToken) { - return ExecuteActionOnIntervalAsync(blobRunInterval, - async cancellationToken => + await ExecuteActionOnIntervalAsync(blobRunInterval, + async token => await UpdateTesTasksFromAvailableEventsAsync( - await ParseAvailableEvents(cancellationToken), - cancellationToken), - stoppingToken); + await ParseAvailableEvents(token), + token), + cancellationToken); } /// @@ -136,12 +127,12 @@ await ParseAvailableEvents(cancellationToken), /// /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. /// s and s from all events. - async ValueTask MarkProcessedAsync)>> ParseAvailableEvents(CancellationToken cancellationToken) + private async ValueTask MarkProcessedAsync)>> ParseAvailableEvents(CancellationToken cancellationToken) { var messages = new ConcurrentBag<(RunnerEventsMessage Message, TesTask Task, AzureBatchTaskState State, Func MarkProcessedAsync)>(); // Get and parse event blobs - await Parallel.ForEachAsync(batchScheduler.GetEventMessagesAsync(cancellationToken), cancellationToken, async (eventMessage, cancellationToken) => + await Parallel.ForEachAsync(batchScheduler.GetEventMessagesAsync(cancellationToken), cancellationToken, async (eventMessage, token) => { var tesTask = await GetTesTaskAsync(eventMessage.Tags["task-id"], eventMessage.Tags["event-name"]); @@ -153,9 +144,23 @@ await Parallel.ForEachAsync(batchScheduler.GetEventMessagesAsync(cancellationTok try { nodeEventProcessor.ValidateMessageMetadata(eventMessage); - eventMessage = await nodeEventProcessor.DownloadAndValidateMessageContentAsync(eventMessage, cancellationToken); - var state = await nodeEventProcessor.GetMessageBatchStateAsync(eventMessage, tesTask, cancellationToken); - messages.Add((eventMessage, tesTask, state, token => 
nodeEventProcessor.MarkMessageProcessedAsync(eventMessage, token))); + eventMessage = await nodeEventProcessor.DownloadAndValidateMessageContentAsync(eventMessage, token); + var state = await nodeEventProcessor.GetMessageBatchStateAsync(eventMessage, tesTask, token); + messages.Add((eventMessage, tesTask, state, ct => nodeEventProcessor.MarkMessageProcessedAsync(eventMessage, ct))); + } + catch (ArgumentException ex) + { + logger.LogError(ex, @"Verifying event metadata failed: {ErrorMessage}", ex.Message); + + messages.Add(( + eventMessage, + tesTask, + new(AzureBatchTaskState.TaskState.InfoUpdate, Warning: new List + { + "EventParsingFailed", + $"{ex.GetType().FullName}: {ex.Message}" + }), + ct => nodeEventProcessor.RemoveMessageFromReattemptsAsync(eventMessage, ct))); } catch (Exception ex) { @@ -169,18 +174,16 @@ await Parallel.ForEachAsync(batchScheduler.GetEventMessagesAsync(cancellationTok "EventParsingFailed", $"{ex.GetType().FullName}: {ex.Message}" }), - (ex is System.Diagnostics.UnreachableException || ex is RunnerEventsProcessor.AssertException) - ? token => nodeEventProcessor.MarkMessageProcessedAsync(eventMessage, token) // Don't retry this event + (ex is System.Diagnostics.UnreachableException || ex is RunnerEventsProcessor.DownloadOrParseException) + ? ct => nodeEventProcessor.MarkMessageProcessedAsync(eventMessage, ct) // Mark event processed to prevent retries : default)); // Retry this event. 
- - return; } // Helpers async ValueTask GetTesTaskAsync(string id, string @event) { TesTask tesTask = default; - if (await repository.TryGetItemAsync(id, cancellationToken, task => tesTask = task) && tesTask is not null) + if (await repository.TryGetItemAsync(id, token, task => tesTask = task) && tesTask is not null) { logger.LogDebug("Completing event '{TaskEvent}' for task {TesTask}.", @event, tesTask.Id); return tesTask; @@ -197,12 +200,12 @@ async ValueTask GetTesTaskAsync(string id, string @event) } /// - /// Updates each task based on the provided states. + /// Updates each task based on the provided state. /// /// A collection of associated s, s, and a method to mark the source event processed. /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. /// - async ValueTask UpdateTesTasksFromAvailableEventsAsync(IEnumerable<(TesTask Task, AzureBatchTaskState State, Func MarkProcessedAsync)> eventStates, CancellationToken cancellationToken) + private async ValueTask UpdateTesTasksFromAvailableEventsAsync(IEnumerable<(TesTask Task, AzureBatchTaskState State, Func MarkProcessedAsync)> eventStates, CancellationToken cancellationToken) { eventStates = eventStates.ToList(); @@ -221,11 +224,11 @@ await OrchestrateTesTasksOnBatchAsync( cancellationToken, "events"); - await Parallel.ForEachAsync(eventStates.Select(@event => @event.MarkProcessedAsync).Where(func => func is not null), cancellationToken, async (markEventProcessed, cancellationToken) => + await Parallel.ForEachAsync(eventStates.Select(@event => @event.MarkProcessedAsync).Where(func => func is not null), cancellationToken, async (markEventProcessed, token) => { try { - await markEventProcessed(cancellationToken); + await markEventProcessed(token); } catch (Exception ex) { From 12bfb3a5df9d9e31cd97e2262475ea10821d693e Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 17 Nov 2023 10:14:57 -0800 Subject: [PATCH 094/202] Missed changes from the merge from 
'main' --- src/Tes.ApiClients/CachingRetryHandler.cs | 4 ++-- src/Tes.ApiClients/HttpApiClient.cs | 4 ++-- src/Tes.ApiClients/RetryHandler.cs | 12 ++++++------ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/Tes.ApiClients/CachingRetryHandler.cs b/src/Tes.ApiClients/CachingRetryHandler.cs index 5bc94e775..b82c7477c 100644 --- a/src/Tes.ApiClients/CachingRetryHandler.cs +++ b/src/Tes.ApiClients/CachingRetryHandler.cs @@ -46,7 +46,7 @@ protected CachingRetryHandler() { } /// A for controlling the lifetime of the asynchronous operation. /// /// - public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, CancellationToken cancellationToken, Context? context = default) + public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, CancellationToken cancellationToken, Context context = default) { ValidateArgs(cacheKey, action); @@ -63,7 +63,7 @@ public virtual async Task ExecuteWithRetryAndCachingAsync(stri /// /// /// - public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, DateTimeOffset cachesExpires, CancellationToken cancellationToken, Context? context = default) + public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, DateTimeOffset cachesExpires, CancellationToken cancellationToken, Context context = default) { ValidateArgs(cacheKey, action); diff --git a/src/Tes.ApiClients/HttpApiClient.cs b/src/Tes.ApiClients/HttpApiClient.cs index 03b2cd3b1..3d3bae185 100644 --- a/src/Tes.ApiClients/HttpApiClient.cs +++ b/src/Tes.ApiClients/HttpApiClient.cs @@ -74,7 +74,7 @@ protected HttpApiClient() { } /// /// Calling method name. /// - private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHandler([System.Runtime.CompilerServices.CallerMemberName] string? 
caller = default) + private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHandler([System.Runtime.CompilerServices.CallerMemberName] string caller = default) => new((exception, timeSpan, retryCount, correlationId) => { Logger?.LogError(exception, @"Retrying in {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan} CorrelationId: {CorrelationId:D}", caller, retryCount, timeSpan, correlationId); @@ -86,7 +86,7 @@ private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHandler([System.Runtime. /// See /// Calling method name. /// - private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHandler([System.Runtime.CompilerServices.CallerMemberName] string? caller = default) + private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHandler([System.Runtime.CompilerServices.CallerMemberName] string caller = default) => new((result, timeSpan, retryCount, correlationId) => { if (result.Exception is null) diff --git a/src/Tes.ApiClients/RetryHandler.cs b/src/Tes.ApiClients/RetryHandler.cs index f8d378b65..431b03dc2 100644 --- a/src/Tes.ApiClients/RetryHandler.cs +++ b/src/Tes.ApiClients/RetryHandler.cs @@ -111,7 +111,7 @@ protected RetryHandler() { } /// /// Result type /// Result instance - public TResult ExecuteWithRetry(Func action, Context? context = default) + public TResult ExecuteWithRetry(Func action, Context context = default) { ArgumentNullException.ThrowIfNull(action); @@ -140,7 +140,7 @@ public TResult ExecuteWithRetry(Func action, Context? context /// /// Result type /// Result instance - public virtual Task ExecuteWithRetryAsync(Func> action, CancellationToken cancellationToken, Context? context = default) + public virtual Task ExecuteWithRetryAsync(Func> action, CancellationToken cancellationToken, Context context = default) { ArgumentNullException.ThrowIfNull(action); @@ -154,7 +154,7 @@ public virtual Task ExecuteWithRetryAsync(FuncA for controlling the lifetime of the asynchronous operation. 
/// /// Result instance - public async Task ExecuteWithRetryAsync(Func action, CancellationToken cancellationToken, Context? context = default) + public async Task ExecuteWithRetryAsync(Func action, CancellationToken cancellationToken, Context context = default) { ArgumentNullException.ThrowIfNull(action); @@ -168,7 +168,7 @@ public async Task ExecuteWithRetryAsync(Func action, Ca /// A for controlling the lifetime of the asynchronous operation. /// /// Result HttpResponse - public virtual async Task ExecuteWithRetryAsync(Func> action, CancellationToken cancellationToken, Context? context = default) + public virtual async Task ExecuteWithRetryAsync(Func> action, CancellationToken cancellationToken, Context context = default) { ArgumentNullException.ThrowIfNull(action); @@ -183,7 +183,7 @@ public static void SetOnRetryHandler(this Context context, RetryHandler.OnRet context[RetryHandler.OnRetryHandlerKey] = onRetry; } - public static RetryHandler.OnRetryHandler? GetOnRetryHandler(this Context context) + public static RetryHandler.OnRetryHandler GetOnRetryHandler(this Context context) { return context.TryGetValue(RetryHandler.OnRetryHandlerKey, out var handler) ? (RetryHandler.OnRetryHandler)handler : default; } @@ -193,7 +193,7 @@ public static void SetOnRetryHandler(this Context context, RetryHandler.OnRetryH context[RetryHandler.OnRetryHandlerKey] = onRetry; } - public static RetryHandler.OnRetryHandler? GetOnRetryHandler(this Context context) + public static RetryHandler.OnRetryHandler GetOnRetryHandler(this Context context) { return context.TryGetValue(RetryHandler.OnRetryHandlerKey, out var handler) ? 
(RetryHandler.OnRetryHandler)handler : default; } From d859c2dd464488684f3d14a26657393d69aa5fb4 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 20 Nov 2023 16:33:51 -0800 Subject: [PATCH 095/202] Stage compute node ids and parallelize portions of queued task handling --- src/TesApi.Web/BatchPool.cs | 73 +++--- src/TesApi.Web/BatchPools.BatchScheduler.cs | 2 +- src/TesApi.Web/BatchScheduler.cs | 214 ++++++++++++------ src/TesApi.Web/IBatchPool.cs | 12 +- .../OrchestrateOnBatchSchedulerServiceBase.cs | 2 +- src/TesApi.Web/PoolScheduler.cs | 65 +++--- 6 files changed, 231 insertions(+), 137 deletions(-) diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index 0f3bf46ef..0e167feb6 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -314,12 +314,8 @@ private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken canc switch (_scalingMode) { - case ScalingMode.AutoScaleEnabled when autoScaleEnabled != true: - _scalingMode = ScalingMode.RemovingFailedNodes; - break; - - case ScalingMode.AutoScaleEnabled when autoScaleEnabled == true: - if (_resetAutoScalingRequired || await GetNodesToRemove(false).AnyAsync(cancellationToken)) + case ScalingMode.AutoScaleEnabled: + if (_resetAutoScalingRequired || await (await GetNodesToRemove()).AnyAsync(cancellationToken)) { _logger.LogInformation(@"Switching pool {PoolId} to manual scale to clear resize errors and/or compute nodes in invalid states.", Id); await _azureProxy.DisableBatchPoolAutoScaleAsync(Id, cancellationToken); @@ -332,7 +328,7 @@ private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken canc var nodesToRemove = Enumerable.Empty(); // It's documented that a max of 100 nodes can be removed at a time. Excess eligible nodes will be removed in a future call to this method. 
- await foreach (var node in GetNodesToRemove(true).Take(MaxComputeNodesToRemoveAtOnce).WithCancellation(cancellationToken)) + await foreach (var node in (await GetNodesToRemove()).Take(MaxComputeNodesToRemoveAtOnce).WithCancellation(cancellationToken)) { switch (node.State) { @@ -534,25 +530,34 @@ internal async ValueTask ServicePoolAsync(ServiceKind serviceKind, CancellationT /// public async ValueTask ServicePoolAsync(CancellationToken cancellationToken) { +#pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously async ValueTask StandupQueries() +#pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously { + _taskPreviousComputeNodeIds.Clear(); + _foundTasks.ForEach(task => + { + if (PoolScheduler.TaskListWithComputeNodeInfoPredicate(task)) + { + _taskPreviousComputeNodeIds.Add(task.Id, task.ComputeNodeInformation.ComputeNodeId); + } + }); + // List tasks from batch just one time each time we service the pool when called from PoolScheduler - _foundTasks = await GetTasksAsync("creationTime,executionInfo,id,nodeInfo,state,stateTransitionTime", null).ToListAsync(cancellationToken); + _foundTasks.Clear(); + _foundTasks.AddRange(GetTasksAsync("creationTime,executionInfo,id,nodeInfo,state,stateTransitionTime", null).ToBlockingEnumerable(cancellationToken)); + _logger.LogDebug("{PoolId}: {TaskCount} tasks discovered.", Id, _foundTasks.Count); // List nodes from Batch at most one time each time we service the pool - if (_foundTasks.Where(PoolScheduler.TaskListWithComputeNodeInfoPredicate).Any()) - { - var nodes = (await _azureProxy.ListComputeNodesAsync(Id, - new ODATADetailLevel(filterClause: @"state eq 'starttaskfailed' or state eq 'preempted' or state eq 'unusable'", selectClause: @"errors,id,state,startTaskInfo")) + _lazyComputeNodes = _taskPreviousComputeNodeIds.Count == 0 + + ? 
new(() => Task.FromResult(_azureProxy.ListComputeNodesAsync(Id, + new ODATADetailLevel(filterClause: EjectableComputeNodesFilterClause, selectClause: EjectableComputeNodesSelectClause())))) + + : new(async () => (await _azureProxy.ListComputeNodesAsync(Id, + new ODATADetailLevel(filterClause: EjectableComputeNodesFilterClause, selectClause: EjectableComputeNodesSelectClause())) .ToListAsync(cancellationToken)) - .ToAsyncEnumerable(); - _lazyComputeNodes = _ => new(nodes); - } - else - { - _lazyComputeNodes = withState => new(_azureProxy.ListComputeNodesAsync(Id, - new ODATADetailLevel(filterClause: @"state eq 'starttaskfailed' or state eq 'preempted' or state eq 'unusable'", selectClause: withState.Value ? @"id,state,startTaskInfo" : @"id"))); - } + .ToAsyncEnumerable()); } var exceptions = new List(); @@ -635,26 +640,38 @@ async ValueTask RemoveMissingPoolsAsync(Exception ex, CancellationToken ca } } - private IEnumerable _foundTasks = Enumerable.Empty(); + private readonly List _foundTasks = new(); + private readonly Dictionary _taskPreviousComputeNodeIds = new(); - private Func>> _lazyComputeNodes; + private Lazy>> _lazyComputeNodes; + private const string EjectableComputeNodesFilterClause = @"state eq 'starttaskfailed' or state eq 'preempted' or state eq 'unusable'"; + private string EjectableComputeNodesSelectClause() + => ScalingMode.AutoScaleDisabled.Equals(_scalingMode) switch + { + false => _taskPreviousComputeNodeIds.Count == 0 // Not removing compute nodes + ? @"id" // Not servicing tasks by compute node + : @"errors,id,state", // Servicing tasks by compute node + true => _taskPreviousComputeNodeIds.Count == 0 // Possibly removing compute nodes + ? 
@"id,state,startTaskInfo" // Not servicing tasks by compute node + : @"errors,id,state,startTaskInfo", // Servicing tasks by compute node + }; - private IAsyncEnumerable GetNodesToRemove(bool withState) - => _lazyComputeNodes(withState).Value; + private async ValueTask> GetNodesToRemove() + => await _lazyComputeNodes.Value; /// - public IAsyncEnumerable ListLostComputeNodesAsync() + public async Task> ListEjectableComputeNodesAsync() { - return _lazyComputeNodes(null).Value.Where(node => !ComputeNodeState.StartTaskFailed.Equals(node.State)); + return (await _lazyComputeNodes.Value).Where(node => !ComputeNodeState.StartTaskFailed.Equals(node.State)); } private IEnumerable GetTasks(bool includeCompleted) => _foundTasks.Where(task => includeCompleted || !TaskState.Completed.Equals(task.State)); /// - public IAsyncEnumerable ListCloudTasksAsync() + public IEnumerable ListCloudTasksAsync() { - return _foundTasks.ToAsyncEnumerable(); + return _foundTasks.Select(task => new CloudTaskWithPreviousComputeNodeId(task, _taskPreviousComputeNodeIds.TryGetValue(task.Id, out var nodeId) ? 
nodeId : default)); } /// diff --git a/src/TesApi.Web/BatchPools.BatchScheduler.cs b/src/TesApi.Web/BatchPools.BatchScheduler.cs index 31e7be03d..b2732d2af 100644 --- a/src/TesApi.Web/BatchPools.BatchScheduler.cs +++ b/src/TesApi.Web/BatchPools.BatchScheduler.cs @@ -27,7 +27,7 @@ public partial class BatchScheduler internal delegate ValueTask ModelPoolFactory(string poolId, CancellationToken cancellationToken); - private (string PoolKey, string DisplayName) GetPoolKey(TesTask tesTask, VirtualMachineInformation virtualMachineInformation, BatchModels.ContainerConfiguration containerConfiguration, IList identities, CancellationToken cancellationToken) + private (string PoolKey, string DisplayName) GetPoolKey(TesTask tesTask, VirtualMachineInformation virtualMachineInformation, BatchModels.ContainerConfiguration containerConfiguration, IList identities) { var identityResourceIds = ""; diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 274518855..0d3c3680a 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -3,7 +3,9 @@ using System; +using System.Collections.Concurrent; using System.Collections.Generic; +using System.Collections.Immutable; using System.IO; using System.Linq; using System.Text; @@ -576,19 +578,21 @@ public string GetTesTaskIdFromCloudTaskId(string cloudTaskId) private static bool IsCromwellCommandScript(TesInput inputFile) => (inputFile.Name?.Equals("commandScript") ?? false) && (inputFile.Description?.EndsWith(".commandScript") ?? 
false) && inputFile.Type == TesFileType.FILEEnum && inputFile.Path.EndsWith($"/{CromwellScriptFileName}"); + private record struct ContainerMetadata(BatchModels.ContainerConfiguration ContainerConfiguration, (bool ExecutorImage, bool DockerInDockerImage, bool CromwellDrsImage) IsPublic); + private record struct QueuedTaskMetadata(TesTask TesTask, VirtualMachineInformation VirtualMachineInfo, ContainerMetadata ContainerMetadata, IEnumerable Identities, string PoolDisplayName); + /// public async IAsyncEnumerable> ProcessQueuedTesTasksAsync(TesTask[] tesTasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) { - var tasksMetadataByPoolKey = new Dictionary Identities, string PoolDisplayName)>>(); - var poolKeyByTaskIds = new Dictionary(); // Reverse lookup of 'tasksMetadataByPoolKey' + var tasksMetadataByPoolKey = new ConcurrentDictionary>(); + ConcurrentBag> results = new(); // Early item return facilitator { - var tasks = tesTasks.ToList(); // List of tasks that will make it to the next round. + logger.LogDebug(@"Checking quota for {QueuedTasks} tasks.", tesTasks.Length); // Determine how many nodes in each pool we might need for this group. - foreach (var tesTask in tesTasks) // TODO: Consider parallelizing this foreach loop. 
+ await Parallel.ForEachAsync(tesTasks, cancellationToken, async (tesTask, token) => { - Task quickResult = default; // fast exit enabler string poolKey = default; var identities = new List(); @@ -604,43 +608,44 @@ public async IAsyncEnumerable> ProcessQueuedTesTasksA try { - var virtualMachineInfo = await GetVmSizeAsync(tesTask, cancellationToken); - var containerMetadata = await GetContainerConfigurationIfNeededAsync(tesTask, cancellationToken); - (poolKey, var displayName) = GetPoolKey(tesTask, virtualMachineInfo, containerMetadata.ContainerConfiguration, identities, cancellationToken); - await quotaVerifier.CheckBatchAccountQuotasAsync(virtualMachineInfo, needPoolOrJobQuotaCheck: !IsPoolAvailable(poolKey), cancellationToken: cancellationToken); + var virtualMachineInfo = await GetVmSizeAsync(tesTask, token); + var containerMetadata = await GetContainerConfigurationIfNeededAsync(tesTask, token); + (poolKey, var displayName) = GetPoolKey(tesTask, virtualMachineInfo, containerMetadata.ContainerConfiguration, identities); + await quotaVerifier.CheckBatchAccountQuotasAsync(virtualMachineInfo, needPoolOrJobQuotaCheck: !IsPoolAvailable(poolKey), cancellationToken: token); - if (tasksMetadataByPoolKey.TryGetValue(poolKey, out var resource)) + try { - resource.Add((tesTask, virtualMachineInfo, containerMetadata, identities, displayName)); + _ = tasksMetadataByPoolKey.AddOrUpdate(poolKey, + _1 => ImmutableArray.Empty.Add(new(tesTask, virtualMachineInfo, containerMetadata, identities, displayName)), + (_1, list) => list.Add(new(tesTask, virtualMachineInfo, containerMetadata, identities, displayName))); } - else + catch (OverflowException) { - tasksMetadataByPoolKey.Add(poolKey, new() { (tesTask, virtualMachineInfo, containerMetadata, identities, displayName) }); + throw; } - - poolKeyByTaskIds.Add(tesTask.Id, poolKey); } catch (Exception ex) { - quickResult = HandleException(ex, poolKey, tesTask); + results.Add(new(HandleExceptionAsync(ex, poolKey, tesTask), tesTask)); } 
+ }); + } - if (quickResult is not null) - { - tasks.Remove(tesTask); - yield return new(quickResult, tesTask); - } - } + // Return any results that are ready + foreach (var result in results) + { + yield return result; + } - // Remove already returned tasks from the dictionary - tasksMetadataByPoolKey = tasksMetadataByPoolKey - .Select(p => (p.Key, Value: p.Value.Where(v => tasks.Contains(v.TesTask)).ToList())) // keep only tasks that remain in the 'tasks' variable - .Where(t => t.Value.Count != 0) // Remove any now empty pool keys - .ToDictionary(p => p.Key, p => p.Value); + if (tasksMetadataByPoolKey.IsEmpty) + { + yield break; } + results.Clear(); + // Determine how many nodes in each new pool we might need for this group. - var neededPoolNodesByPoolKey = tasksMetadataByPoolKey.ToDictionary(t => t.Key, t => t.Value.Count); + var neededPoolNodesByPoolKey = tasksMetadataByPoolKey.ToDictionary(t => t.Key, t => t.Value.Length); { // Determine how many new pools/jobs we will need for this batch @@ -650,52 +655,56 @@ public async IAsyncEnumerable> ProcessQueuedTesTasksA // This will remove pool keys we cannot accomodate due to quota, along with all of their associated tasks, from being queued into Batch. 
if (requiredNewPools > 1) { - var (excess, exception) = await quotaVerifier.CheckBatchAccountPoolAndJobQuotasAsync(requiredNewPools, cancellationToken); - var initial = tasksMetadataByPoolKey.Count - 1; - var final = initial - excess; - - for (var i = initial; i > final; --i) + for (var (excess, exception) = await quotaVerifier.CheckBatchAccountPoolAndJobQuotasAsync(requiredNewPools, cancellationToken); + excess > 0; ) { - var key = tasksMetadataByPoolKey.Keys.ElementAt(i); - if (tasksMetadataByPoolKey.Remove(key, out var listOfTaskMetadata)) + var key = tasksMetadataByPoolKey.Keys.Last(); + if (tasksMetadataByPoolKey.TryRemove(key, out var listOfTaskMetadata)) { - foreach (var (task, _, _, _, _) in listOfTaskMetadata) + foreach (var task in listOfTaskMetadata.Select(m => m.TesTask)) { - yield return new(HandleException(exception, key, task), task); + yield return new(HandleExceptionAsync(exception, key, task), task); } + + excess--; } } } } + logger.LogDebug(@"Obtaining {PoolQuantity} batch pool identifiers for {QueuedTasks} tasks.", tasksMetadataByPoolKey.Count, tasksMetadataByPoolKey.Values.Sum(l => l.Length)); + + // TODO: Consider parallelizing this expression. Doing so would require making GetOrAddPoolAsync multi-threaded safe. 
+ var tasksMetadata = tasksMetadataByPoolKey.ToAsyncEnumerable().SelectAwaitWithCancellation(async (pair, token) => + (pair.Key, Id: await GetPoolIdAsync(pair.Key, pair.Value, token), TaskMetadata: pair.Value)) + .Where(tuple => tuple.Id is not null) + .SelectMany(tuple => tuple.TaskMetadata.ToAsyncEnumerable().Select(metadata => (metadata.TesTask, metadata.VirtualMachineInfo, tuple.Key, tuple.Id))) + .ToBlockingEnumerable(cancellationToken); + + // Return any results that are ready + foreach (var result in results) + { + yield return result; + } + + if (!tasksMetadata.Any()) + { + yield break; + } + + results.Clear(); + + logger.LogDebug(@"Creating batch tasks."); + // Obtain assigned pool and create and assign the cloudtask for each task. - foreach (var (tesTask, virtualMachineInfo, containerMetadata, identities, displayName) in tasksMetadataByPoolKey.Values.SelectMany(e => e)) // TODO: Consider parallelizing this foreach loop. Would require making GetOrAddPoolAsync multi-threaded safe. + await Parallel.ForEachAsync(tasksMetadata, cancellationToken, async (metadata, token) => { - Task quickResult = default; // fast exit enabler - var poolKey = poolKeyByTaskIds[tesTask.Id]; + var (tesTask, virtualMachineInfo, poolKey, poolId) = metadata; try { - string poolId = null; var tesTaskLog = tesTask.AddTesTaskLog(); tesTaskLog.VirtualMachineInfo = virtualMachineInfo; - poolId = (await GetOrAddPoolAsync( - key: poolKey, - isPreemptable: virtualMachineInfo.LowPriority, - modelPoolFactory: async (id, ct) => await GetPoolSpecification( - name: id, - displayName: displayName, - poolIdentity: GetBatchPoolIdentity(identities.ToArray()), - vmSize: virtualMachineInfo.VmSize, - autoscaled: true, - preemptable: virtualMachineInfo.LowPriority, - initialTarget: neededPoolNodesByPoolKey[poolKey], - nodeInfo: (virtualMachineInfo.HyperVGenerations?.Contains("V2")).GetValueOrDefault() ? 
gen2BatchNodeInfo : gen1BatchNodeInfo, - containerConfiguration: containerMetadata.ContainerConfiguration, - encryptionAtHostSupported: virtualMachineInfo.EncryptionAtHostSupported, - cancellationToken: ct), - cancellationToken: cancellationToken)).Id; - var cloudTaskId = $"{tesTask.Id}-{tesTask.Logs.Count}"; tesTask.PoolId = poolId; var cloudTask = await ConvertTesTaskToBatchTaskUsingRunnerAsync(cloudTaskId, tesTask, cancellationToken); @@ -705,12 +714,14 @@ public async IAsyncEnumerable> ProcessQueuedTesTasksA tesTaskLog.StartTime = DateTimeOffset.UtcNow; tesTask.State = TesState.INITIALIZINGEnum; + results.Add(new(Task.FromResult(true), tesTask)); } catch (AggregateException aggregateException) { var exceptions = new List(); - foreach (var partResult in aggregateException.Flatten().InnerExceptions.Select(ex => HandleException(ex, poolKey, tesTask))) + foreach (var partResult in aggregateException.Flatten().InnerExceptions + .Select(ex => HandleExceptionAsync(ex, poolKey, tesTask))) { if (partResult.IsFaulted) { @@ -718,33 +729,90 @@ public async IAsyncEnumerable> ProcessQueuedTesTasksA } } - quickResult = exceptions.Count == 0 + results.Add(new(exceptions.Count == 0 ? 
Task.FromResult(true) - : Task.FromException(new AggregateException(exceptions)); + : Task.FromException(new AggregateException(exceptions)), + tesTask)); } catch (Exception exception) { - quickResult = HandleException(exception, poolKey, tesTask); + results.Add(new(HandleExceptionAsync(exception, poolKey, tesTask), tesTask)); } + }); - if (quickResult is not null) + foreach (var result in results) + { + yield return result; + } + + yield break; + + async ValueTask GetPoolIdAsync(string poolKey, IEnumerable metadata, CancellationToken cancellationToken) + { + metadata = metadata.ToList(); + var tasks = metadata.Select(m => m.TesTask); + var (_, virtualMachineInfo, containerMetadata, identities, displayName) = metadata.First(); + + try + { + return (await GetOrAddPoolAsync( + key: poolKey, + isPreemptable: virtualMachineInfo.LowPriority, + modelPoolFactory: async (id, ct) => await GetPoolSpecification( + name: id, + displayName: displayName, + poolIdentity: GetBatchPoolIdentity(identities.ToArray()), + vmSize: virtualMachineInfo.VmSize, + autoscaled: true, + preemptable: virtualMachineInfo.LowPriority, + initialTarget: neededPoolNodesByPoolKey[poolKey], + nodeInfo: (virtualMachineInfo.HyperVGenerations?.Contains("V2")).GetValueOrDefault() ? gen2BatchNodeInfo : gen1BatchNodeInfo, + containerConfiguration: containerMetadata.ContainerConfiguration, + encryptionAtHostSupported: virtualMachineInfo.EncryptionAtHostSupported, + cancellationToken: ct), + cancellationToken: cancellationToken)).Id; + } + catch (AggregateException aggregateException) { - yield return new(quickResult, tesTask); + var exceptions = new List(); + var innerExceptions = aggregateException.Flatten().InnerExceptions; + + foreach (var tesTask in tasks) + { + foreach (var partResult in innerExceptions + .Select(ex => HandleExceptionAsync(ex, poolKey, tesTask))) + { + if (partResult.IsFaulted) + { + exceptions.Add(partResult.Exception); + } + } + + results.Add(new(exceptions.Count == 0 + ? 
Task.FromResult(true) + : Task.FromException(new AggregateException(exceptions)), + tesTask)); + } } - else + catch (Exception exception) { - yield return new(Task.FromResult(true), tesTask); + foreach (var tesTask in tasks) + { + results.Add(new(HandleExceptionAsync(exception, poolKey, tesTask), tesTask)); + } } + + return null; } - Task HandleException(Exception exception, string poolKey, TesTask tesTask) + Task HandleExceptionAsync(Exception exception, string poolKey, TesTask tesTask) { switch (exception) { case AzureBatchPoolCreationException azureBatchPoolCreationException: if (!azureBatchPoolCreationException.IsTimeout && !azureBatchPoolCreationException.IsJobQuota && !azureBatchPoolCreationException.IsPoolQuota && azureBatchPoolCreationException.InnerException is not null) { - return HandleException(azureBatchPoolCreationException.InnerException, poolKey, tesTask); + return HandleExceptionAsync(azureBatchPoolCreationException.InnerException, poolKey, tesTask); } logger.LogWarning(azureBatchPoolCreationException, "TES task: {TesTask} AzureBatchPoolCreationException.Message: {ExceptionMessage}. This might be a transient issue. Task will remain with state QUEUED. Confirmed timeout: {ConfirmedTimeout}", tesTask.Id, azureBatchPoolCreationException.Message, azureBatchPoolCreationException.IsTimeout); @@ -1043,7 +1111,7 @@ var s when s.StartsWith("batch.node.centos ") => "sudo yum install epel-release /// A for controlling the lifetime of the asynchronous operation. 
/// // TODO: remove this as soon as the node runner can authenticate to container registries - private async ValueTask<(BatchModels.ContainerConfiguration ContainerConfiguration, (bool ExecutorImage, bool DockerInDockerImage, bool CromwellDrsImage) IsPublic)> GetContainerConfigurationIfNeededAsync(TesTask tesTask, CancellationToken cancellationToken) + private async ValueTask GetContainerConfigurationIfNeededAsync(TesTask tesTask, CancellationToken cancellationToken) { var drsImageNeeded = tesTask.Inputs?.Any(i => i?.Url?.StartsWith("drs://") ?? false) ?? false; // TODO: Support for multiple executors. Cromwell has single executor per task. @@ -1083,7 +1151,7 @@ var s when s.StartsWith("batch.node.centos ") => "sudo yum install epel-release } } - return result is null || result.ContainerRegistries.Count == 0 ? (default, (true, true, true)) : (result, (executorImageIsPublic, dockerInDockerIsPublic, cromwellDrsIsPublic)); + return result is null || result.ContainerRegistries.Count == 0 ? default : new(result, (executorImageIsPublic, dockerInDockerIsPublic, cromwellDrsIsPublic)); async ValueTask AddRegistryIfNeeded(string imageName) { @@ -1131,8 +1199,8 @@ private static BatchModels.BatchPoolIdentity GetBatchPoolIdentity(string[] ident /// private async ValueTask GetPoolSpecification(string name, string displayName, BatchModels.BatchPoolIdentity poolIdentity, string vmSize, bool autoscaled, bool preemptable, int initialTarget, BatchNodeInfo nodeInfo, BatchModels.ContainerConfiguration containerConfiguration, bool encryptionAtHostSupported, CancellationToken cancellationToken) { - ValidateString(name, nameof(name), 64); - ValidateString(displayName, nameof(displayName), 1024); + ValidateString(name, 64); + ValidateString(displayName, 1024); var vmConfig = new BatchModels.VirtualMachineConfiguration( imageReference: new BatchModels.ImageReference( @@ -1188,7 +1256,7 @@ private static BatchModels.BatchPoolIdentity GetBatchPoolIdentity(string[] ident return poolSpec; - 
static void ValidateString(string value, string paramName, int maxLength) + static void ValidateString(string value, int maxLength, [System.Runtime.CompilerServices.CallerArgumentExpression(nameof(value))] string paramName = null) { ArgumentNullException.ThrowIfNull(value, paramName); if (value.Length > maxLength) throw new ArgumentException($"{paramName} exceeds maximum length {maxLength}", paramName); diff --git a/src/TesApi.Web/IBatchPool.cs b/src/TesApi.Web/IBatchPool.cs index 5d996b1c3..0e78490fd 100644 --- a/src/TesApi.Web/IBatchPool.cs +++ b/src/TesApi.Web/IBatchPool.cs @@ -72,13 +72,13 @@ public interface IBatchPool /// Lists s running in pool's job. /// /// - IAsyncEnumerable ListCloudTasksAsync(); + IEnumerable ListCloudTasksAsync(); /// /// Lists s that are or . /// /// - IAsyncEnumerable ListLostComputeNodesAsync(); + Task> ListEjectableComputeNodesAsync(); /// /// Gets the last time the pool's compute node list was changed. @@ -88,10 +88,10 @@ public interface IBatchPool ValueTask GetAllocationStateTransitionTimeAsync(CancellationToken cancellationToken = default); /// - /// A not yet assigned a compute nodes to remove due to a nonrecoverable compute node or pool resize error. + /// A with a compute node id. /// - /// A s not yet assigned a compute node. - /// A compute node and/or pool resize error. - public record CloudTaskBatchTaskState(string CloudTaskId, AzureBatchTaskState TaskState); + /// A . + /// A compute node id or null. 
+ public record CloudTaskWithPreviousComputeNodeId(CloudTask CloudTask, string PreviousComputeNodeId); } } diff --git a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs index 648f9f5aa..986f65358 100644 --- a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs +++ b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs @@ -287,7 +287,7 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func< await batchScheduler.FlushPoolsAsync(pools, cancellationToken); } - logger.LogDebug("OrchestrateTesTasksOnBatch({Poll}) for {TaskCount} {UnitsLabel} completed in {TotalSeconds} seconds.", pollName, tesTasks.Where(task => task is not null).Count(), unitsLabel, DateTime.UtcNow.Subtract(startTime).TotalSeconds); + logger.LogDebug("OrchestrateTesTasksOnBatch({Poll}) for {TaskCount} {UnitsLabel} completed in {TotalSeconds:c}.", pollName, tesTasks.Where(task => task is not null).Count(), unitsLabel, DateTime.UtcNow.Subtract(startTime)); } } } diff --git a/src/TesApi.Web/PoolScheduler.cs b/src/TesApi.Web/PoolScheduler.cs index 905f01cac..ae223afb7 100644 --- a/src/TesApi.Web/PoolScheduler.cs +++ b/src/TesApi.Web/PoolScheduler.cs @@ -12,7 +12,7 @@ using Microsoft.Extensions.Logging; using Tes.Models; using Tes.Repository; -using CloudTaskBatchTaskState = TesApi.Web.IBatchPool.CloudTaskBatchTaskState; +using CloudTaskWithPreviousComputeNodeId = TesApi.Web.IBatchPool.CloudTaskWithPreviousComputeNodeId; namespace TesApi.Web { @@ -34,19 +34,16 @@ internal class PoolScheduler : OrchestrateOnBatchSchedulerServiceBase /// /// Predicate to obtain s (recently) running on s. Used to connect tasks and nodes together. /// - /// Shared between , , and to limit Batch API requests to a minimum. 
internal static bool TaskListWithComputeNodeInfoPredicate(CloudTask task) => !TaskState.Completed.Equals(task.State) && !string.IsNullOrEmpty(task.ComputeNodeInformation?.ComputeNodeId); /// /// Predicate to obtain s pending in Azure Batch. /// - /// Shared between and . private static bool ActiveTaskListPredicate(CloudTask task) => TaskState.Active.Equals(task.State); /// /// Predicate used to obtain s to backstop completing s in case of problems with the . /// - /// Shared between and . private static bool CompletedTaskListPredicate(CloudTask task, DateTime now) => TaskState.Completed.Equals(task.State) && task.StateTransitionTime < now - CompletedTaskListTimeSpan; /// @@ -123,20 +120,21 @@ await Parallel.ForEachAsync(pools, cancellationToken, async (pool, token) => /// s requiring attention. /// A for controlling the lifetime of the asynchronous operation. /// - private async ValueTask ProcessTasksAsync(IBatchPool pool, DateTime now, IAsyncEnumerable tasks, CancellationToken cancellationToken) + private async ValueTask ProcessTasksAsync(IBatchPool pool, DateTime now, IEnumerable tasks, CancellationToken cancellationToken) { - var batchStateCandidateTasks = AsyncEnumerable.Empty(); + var batchStateCandidateTasks = Enumerable.Empty(); var deletionCandidateTasks = AsyncEnumerable.Empty(); var deletionCandidateCreationCutoff = now - BatchScheduler.BatchDeleteNewTaskWorkaroundTimeSpan; var stateTransitionTimeCutoffForDeletions = now - StateTransitionTimeForDeletionTimeSpan; - await foreach (var task in tasks.WithCancellation(cancellationToken)) + foreach (var taskWithNodeId in tasks) { + var (task, computeNodeId) = taskWithNodeId; - if (TaskListWithComputeNodeInfoPredicate(task) || ActiveTaskListPredicate(task) || CompletedTaskListPredicate(task, now)) + if (!string.IsNullOrWhiteSpace(computeNodeId) || ActiveTaskListPredicate(task) || CompletedTaskListPredicate(task, now)) { - batchStateCandidateTasks = batchStateCandidateTasks.Append(task); + 
batchStateCandidateTasks = batchStateCandidateTasks.Append(taskWithNodeId); } if (TaskState.Completed.Equals(task.State) && task.CreationTime < deletionCandidateCreationCutoff && task.StateTransitionTime < stateTransitionTimeCutoffForDeletions) @@ -147,7 +145,7 @@ private async ValueTask ProcessTasksAsync(IBatchPool pool, DateTime now, IAsyncE await ProcessCloudTaskStatesAsync(pool.Id, GetCloudTaskStatesAsync(pool, now, batchStateCandidateTasks, cancellationToken), cancellationToken); - await ProcessDeletedTasks(deletionCandidateTasks, cancellationToken); + await ProcessTasksToDelete(deletionCandidateTasks, cancellationToken); } /// @@ -191,12 +189,12 @@ await OrchestrateTesTasksOnBatchAsync( } /// - /// Deletes completed s. + /// Deletes s. /// /// Tasks to delete. /// A for controlling the lifetime of the asynchronous operation. /// - private async ValueTask ProcessDeletedTasks(IAsyncEnumerable tasks, CancellationToken cancellationToken) + private async ValueTask ProcessTasksToDelete(IAsyncEnumerable tasks, CancellationToken cancellationToken) { await foreach (var taskResult in batchScheduler.DeleteCloudTasksAsync(tasks, cancellationToken).WithCancellation(cancellationToken)) { @@ -221,32 +219,40 @@ private async ValueTask ProcessDeletedTasks(IAsyncEnumerable - /// Obtains for updating s. + /// Obtains s for updating s. /// /// The associated with . /// Reference time. /// s which need s for further processing. /// A for controlling the lifetime of the asynchronous operation. - /// - private async IAsyncEnumerable GetCloudTaskStatesAsync(IBatchPool pool, DateTime now, IAsyncEnumerable tasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) + /// for each that are associated with or assigned errors reported by batch. + /// If a task was running on a compute node reported as in a fatal state, that state will be reported for the task. 
Otherwise, pending tasks will be assigned resize and starttask failures and completed tasks will be reported as complete. + private async IAsyncEnumerable GetCloudTaskStatesAsync(IBatchPool pool, DateTime now, IEnumerable tasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) { - List taskListWithComputeNodeInfo; // To check if the task was running when its node became preempted or unusable + List taskListWithComputeNodeInfo; // To check if the task was running when its node became preempted or unusable List activeTaskList; // These are candidates to be the victim of resizes or starttask failures List completedTaskList; // Backstop if events don't provide timely task completion information in a timely manner { - var taskList = await tasks.ToListAsync(cancellationToken); - taskListWithComputeNodeInfo = taskList.Where(TaskListWithComputeNodeInfoPredicate).ToList(); + var tasksWithNodeIds = tasks.ToList(); + taskListWithComputeNodeInfo = tasksWithNodeIds.Where(task => !string.IsNullOrWhiteSpace(task.PreviousComputeNodeId)).ToList(); + var taskList = tasksWithNodeIds.Select(task => task.CloudTask).ToList(); activeTaskList = taskList.Where(ActiveTaskListPredicate).OrderByDescending(task => task.StateTransitionTime?.ToUniversalTime()).ToList(); completedTaskList = taskList.Where(task => CompletedTaskListPredicate(task, now)).ToList(); } if (taskListWithComputeNodeInfo.Count > 0) { - await foreach (var node in pool.ListLostComputeNodesAsync().WithCancellation(cancellationToken)) + logger.LogDebug("{PoolId} reported nodes that will be removed. 
There are {tasksWithComputeNodeInfo} tasks that might be impacted.", pool.Id, taskListWithComputeNodeInfo.Count); + + await foreach (var node in (await pool.ListEjectableComputeNodesAsync()).WithCancellation(cancellationToken)) { - foreach (var task in taskListWithComputeNodeInfo.Where(task => node.Id.Equals(task.ComputeNodeInformation.ComputeNodeId, StringComparison.InvariantCultureIgnoreCase))) + foreach (var task in taskListWithComputeNodeInfo + .Where(task => node.Id.Equals(task.PreviousComputeNodeId, StringComparison.InvariantCultureIgnoreCase)) + .Select(task => task.CloudTask)) { + logger.LogDebug("{TaskId} connected to node {NodeId} in state {NodeState}.", task.Id, node.Id, node.State); + yield return new(task.Id, node.State switch { ComputeNodeState.Preempted => new(AzureBatchTaskState.TaskState.NodePreempted), @@ -254,14 +260,13 @@ private async IAsyncEnumerable GetCloudTaskStatesAsync( _ => throw new System.Diagnostics.UnreachableException(), }); + logger.LogDebug("Removing {TaskId} from consideration for other errors.", task.Id); _ = activeTaskList.Remove(task); } } } - await foreach (var state in activeTaskList.ToAsyncEnumerable() - .Zip(GetFailures(cancellationToken), (cloud, state) => new CloudTaskBatchTaskState(cloud.Id, state)) - .WithCancellation(cancellationToken)) + foreach (var state in activeTaskList.Zip(GetFailures(), (cloud, state) => new CloudTaskBatchTaskState(cloud.Id, state))) { yield return state; } @@ -290,9 +295,7 @@ static AzureBatchTaskState.FailureInformation ParseComputeNodeErrors(IReadOnlyLi } } -#pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously - async IAsyncEnumerable GetFailures([System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) -#pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously + IEnumerable GetFailures() { foreach (var failure in RepeatUntil(PopNextStartTaskFailure, failure => failure is 
null)) { @@ -316,8 +319,7 @@ AzureBatchTaskState ConvertFromResize(ResizeError failure) AzureBatchTaskState ConvertFromStartTask(TaskFailureInformation failure) => new(AzureBatchTaskState.TaskState.NodeStartTaskFailed, Failure: new(failure.Code, Enumerable.Empty() - .Append(failure.Message) - .Append($"Start task failed ({failure.Category})") + .Append($"Start task failed ({failure.Category}): {failure.Message}") .Concat(failure.Details.Select(FormatNameValuePair)))); ResizeError PopNextResizeError() @@ -371,5 +373,12 @@ static IEnumerable RepeatUntil(Func func, Predicate stop) while (true); } } + + /// + /// A associated with a pool resize or node error. + /// + /// . + /// A compute node and/or pool resize error. + private record CloudTaskBatchTaskState(string CloudTaskId, AzureBatchTaskState TaskState); } } From e27682fa2f0803226b696157d3092753abd5aca2 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Tue, 21 Nov 2023 09:36:12 -0800 Subject: [PATCH 096/202] formatting and minor refactor --- src/TesApi.Web/BatchScheduler.cs | 125 ++++++++++++++++--------------- 1 file changed, 63 insertions(+), 62 deletions(-) diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 0d3c3680a..f1f363d25 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -656,7 +656,7 @@ await Parallel.ForEachAsync(tesTasks, cancellationToken, async (tesTask, token) if (requiredNewPools > 1) { for (var (excess, exception) = await quotaVerifier.CheckBatchAccountPoolAndJobQuotasAsync(requiredNewPools, cancellationToken); - excess > 0; ) + excess > 0;) { var key = tasksMetadataByPoolKey.Keys.Last(); if (tasksMetadataByPoolKey.TryRemove(key, out var listOfTaskMetadata)) @@ -674,6 +674,64 @@ await Parallel.ForEachAsync(tesTasks, cancellationToken, async (tesTask, token) logger.LogDebug(@"Obtaining {PoolQuantity} batch pool identifiers for {QueuedTasks} tasks.", tasksMetadataByPoolKey.Count, tasksMetadataByPoolKey.Values.Sum(l => 
l.Length)); + async ValueTask GetPoolIdAsync(string poolKey, IEnumerable metadata, CancellationToken cancellationToken) + { + metadata = metadata.ToList(); + var tasks = metadata.Select(m => m.TesTask); + var (_, virtualMachineInfo, containerMetadata, identities, displayName) = metadata.First(); + + try + { + return (await GetOrAddPoolAsync( + key: poolKey, + isPreemptable: virtualMachineInfo.LowPriority, + modelPoolFactory: async (id, ct) => await GetPoolSpecification( + name: id, + displayName: displayName, + poolIdentity: GetBatchPoolIdentity(identities.ToArray()), + vmSize: virtualMachineInfo.VmSize, + autoscaled: true, + preemptable: virtualMachineInfo.LowPriority, + initialTarget: neededPoolNodesByPoolKey[poolKey], + nodeInfo: (virtualMachineInfo.HyperVGenerations?.Contains("V2")).GetValueOrDefault() ? gen2BatchNodeInfo : gen1BatchNodeInfo, + containerConfiguration: containerMetadata.ContainerConfiguration, + encryptionAtHostSupported: virtualMachineInfo.EncryptionAtHostSupported, + cancellationToken: ct), + cancellationToken: cancellationToken)).Id; + } + catch (AggregateException aggregateException) + { + var exceptions = new List(); + var innerExceptions = aggregateException.Flatten().InnerExceptions; + + foreach (var tesTask in tasks) + { + foreach (var partResult in innerExceptions + .Select(ex => HandleExceptionAsync(ex, poolKey, tesTask))) + { + if (partResult.IsFaulted) + { + exceptions.Add(partResult.Exception); + } + } + + results.Add(new(exceptions.Count == 0 + ? Task.FromResult(true) + : Task.FromException(new AggregateException(exceptions)), + tesTask)); + } + } + catch (Exception exception) + { + foreach (var tesTask in tasks) + { + results.Add(new(HandleExceptionAsync(exception, poolKey, tesTask), tesTask)); + } + } + + return null; + } + // TODO: Consider parallelizing this expression. Doing so would require making GetOrAddPoolAsync multi-threaded safe. 
var tasksMetadata = tasksMetadataByPoolKey.ToAsyncEnumerable().SelectAwaitWithCancellation(async (pair, token) => (pair.Key, Id: await GetPoolIdAsync(pair.Key, pair.Value, token), TaskMetadata: pair.Value)) @@ -681,6 +739,8 @@ await Parallel.ForEachAsync(tesTasks, cancellationToken, async (tesTask, token) .SelectMany(tuple => tuple.TaskMetadata.ToAsyncEnumerable().Select(metadata => (metadata.TesTask, metadata.VirtualMachineInfo, tuple.Key, tuple.Id))) .ToBlockingEnumerable(cancellationToken); + tasksMetadata = tasksMetadata.ToList(); + // Return any results that are ready foreach (var result in results) { @@ -694,7 +754,7 @@ await Parallel.ForEachAsync(tesTasks, cancellationToken, async (tesTask, token) results.Clear(); - logger.LogDebug(@"Creating batch tasks."); + logger.LogDebug(@"Creating batch tasks for {QueuedTasks} tasks.", tasksMetadata.Count()); // Obtain assigned pool and create and assign the cloudtask for each task. await Parallel.ForEachAsync(tasksMetadata, cancellationToken, async (metadata, token) => @@ -747,64 +807,6 @@ await Parallel.ForEachAsync(tasksMetadata, cancellationToken, async (metadata, t yield break; - async ValueTask GetPoolIdAsync(string poolKey, IEnumerable metadata, CancellationToken cancellationToken) - { - metadata = metadata.ToList(); - var tasks = metadata.Select(m => m.TesTask); - var (_, virtualMachineInfo, containerMetadata, identities, displayName) = metadata.First(); - - try - { - return (await GetOrAddPoolAsync( - key: poolKey, - isPreemptable: virtualMachineInfo.LowPriority, - modelPoolFactory: async (id, ct) => await GetPoolSpecification( - name: id, - displayName: displayName, - poolIdentity: GetBatchPoolIdentity(identities.ToArray()), - vmSize: virtualMachineInfo.VmSize, - autoscaled: true, - preemptable: virtualMachineInfo.LowPriority, - initialTarget: neededPoolNodesByPoolKey[poolKey], - nodeInfo: (virtualMachineInfo.HyperVGenerations?.Contains("V2")).GetValueOrDefault() ? 
gen2BatchNodeInfo : gen1BatchNodeInfo, - containerConfiguration: containerMetadata.ContainerConfiguration, - encryptionAtHostSupported: virtualMachineInfo.EncryptionAtHostSupported, - cancellationToken: ct), - cancellationToken: cancellationToken)).Id; - } - catch (AggregateException aggregateException) - { - var exceptions = new List(); - var innerExceptions = aggregateException.Flatten().InnerExceptions; - - foreach (var tesTask in tasks) - { - foreach (var partResult in innerExceptions - .Select(ex => HandleExceptionAsync(ex, poolKey, tesTask))) - { - if (partResult.IsFaulted) - { - exceptions.Add(partResult.Exception); - } - } - - results.Add(new(exceptions.Count == 0 - ? Task.FromResult(true) - : Task.FromException(new AggregateException(exceptions)), - tesTask)); - } - } - catch (Exception exception) - { - foreach (var tesTask in tasks) - { - results.Add(new(HandleExceptionAsync(exception, poolKey, tesTask), tesTask)); - } - } - - return null; - } - Task HandleExceptionAsync(Exception exception, string poolKey, TesTask tesTask) { switch (exception) @@ -826,8 +828,7 @@ Task HandleExceptionAsync(Exception exception, string poolKey, TesTask tes Microsoft.Rest.Azure.CloudException cloudException => cloudException.Body.Message, var e when e is BatchException batchException && batchException.InnerException is Microsoft.Azure.Batch.Protocol.Models.BatchErrorException batchErrorException => batchErrorException.Body.Message.Value, _ => "Unknown reason", - }, - Array.Empty()); + }); } break; From 48c19ac7fcea27f2098c86f6a682fa8a0151d40b Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 22 Nov 2023 23:54:33 -0800 Subject: [PATCH 097/202] Refactor based on feedback --- .../CacheAndRetryHandlerTest.cs | 29 +- .../TerraLandingZoneApiClientTest.cs | 20 +- .../TerraWsmApiClientTests.cs | 23 +- ...etryHandlersHelpers-HttpResponseMessage.cs | 41 + .../TestServices/RetryHandlersHelpers.cs | 44 + src/Tes.ApiClients/CachingRetryHandler.cs | 241 +++++- 
src/Tes.ApiClients/HttpApiClient.cs | 36 +- src/Tes.ApiClients/RetryHandler.cs | 787 ++++++++++++++---- src/Tes.ApiClients/Tes.ApiClients.csproj | 10 + src/Tes.Runner/Docker/DockerExecutor.cs | 8 +- .../ContainerRegistryProviderTests.cs | 25 +- src/TesApi.Tests/StartupTests.cs | 2 +- src/TesApi.Tests/TesApi.Tests.csproj | 4 + src/TesApi.Web/AzureProxy.cs | 50 +- .../CachingWithRetriesAzureProxy.cs | 142 ++-- .../Extensions/PagedInterfaceExtensions.cs | 15 +- src/TesApi.Web/Management/AzureProvider.cs | 10 +- .../Management/ContainerRegistryProvider.cs | 6 +- src/TesApi.Web/Startup.cs | 1 + 19 files changed, 1151 insertions(+), 343 deletions(-) create mode 100644 src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers-HttpResponseMessage.cs create mode 100644 src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers.cs diff --git a/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs b/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs index 84accd22a..18c04a853 100644 --- a/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs +++ b/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs @@ -13,7 +13,8 @@ namespace Tes.ApiClients.Tests; public class CacheAndRetryHandlerTest { private IMemoryCache appCache = null!; - private CachingRetryHandler cachingRetryHandler = null!; + private CachingRetryHandler.ICachingAsyncPolicy cachingAsyncPolicy = null!; + private CachingRetryHandler.ICachingAsyncPolicy cachingAsyncHttpResponseMessagePolicy = null!; private Mock mockInstanceToRetry = null!; private const int MaxRetryCount = 3; @@ -24,7 +25,19 @@ public void SetUp() appCache = new MemoryCache(new MemoryCacheOptions()); mockInstanceToRetry = new Mock(); mockOptions.SetupGet(x => x.Value).Returns(new RetryPolicyOptions { ExponentialBackOffExponent = 1, MaxRetryCount = MaxRetryCount }); - cachingRetryHandler = new(appCache, mockOptions.Object); + var cachingRetryHandler = new CachingRetryHandler(appCache, mockOptions.Object); + + cachingAsyncHttpResponseMessagePolicy = 
cachingRetryHandler + .RetryDefaultHttpResponseMessagePolicyBuilder() + .SetOnRetryBehavior() + .AddCaching() + .BuildAsync(); + + cachingAsyncPolicy = cachingRetryHandler + .RetryDefaultPolicyBuilder() + .SetOnRetryBehavior() + .AddCaching() + .BuildAsync(); } [TestCleanup] @@ -38,7 +51,7 @@ public async Task ExecuteWithRetryAsync_RetriesMaxTimes() { mockInstanceToRetry.Setup(o => o.ToString()).Throws(); - await Assert.ThrowsExceptionAsync(() => cachingRetryHandler.ExecuteWithRetryAsync(_ => Task.Run(() => mockInstanceToRetry.Object.ToString()), CancellationToken.None)); + await Assert.ThrowsExceptionAsync(() => cachingAsyncPolicy.ExecuteWithRetryAsync(_ => Task.Run(() => mockInstanceToRetry.Object.ToString()), CancellationToken.None)); mockInstanceToRetry.Verify(o => o.ToString(), Times.Exactly(MaxRetryCount + 1)); // 3 retries (MaxRetryCount), plus original call } @@ -47,7 +60,7 @@ public async Task ExecuteWithRetryAsync_ReturnsValueAndOneExecutionOnSuccess() { mockInstanceToRetry.Setup(o => o.ToString()).Returns("foo"); - var value = await cachingRetryHandler.ExecuteWithRetryAsync(_ => Task.Run(() => mockInstanceToRetry.Object.ToString()), CancellationToken.None); + var value = await cachingAsyncPolicy.ExecuteWithRetryAsync(_ => Task.Run(() => mockInstanceToRetry.Object.ToString()), CancellationToken.None); mockInstanceToRetry.Verify(o => o.ToString(), Times.Once); Assert.AreEqual("foo", value); } @@ -58,8 +71,8 @@ public async Task ExecuteWithRetryAndCachingAsync_ValueIsCachedOnSuccessMethodCa var cacheKey = Guid.NewGuid().ToString(); mockInstanceToRetry.Setup(o => o.ToString()).Returns("foo"); - var first = await cachingRetryHandler.ExecuteWithRetryAndCachingAsync(cacheKey, _ => Task.Run(() => mockInstanceToRetry.Object.ToString()), CancellationToken.None); - var second = await cachingRetryHandler.ExecuteWithRetryAndCachingAsync(cacheKey, _ => Task.Run(() => mockInstanceToRetry.Object.ToString()), CancellationToken.None); + var first = await 
cachingAsyncPolicy.ExecuteWithRetryAndCachingAsync(cacheKey, _ => Task.Run(() => mockInstanceToRetry.Object.ToString()), CancellationToken.None); + var second = await cachingAsyncPolicy.ExecuteWithRetryAndCachingAsync(cacheKey, _ => Task.Run(() => mockInstanceToRetry.Object.ToString()), CancellationToken.None); mockInstanceToRetry.Verify(o => o.ToString(), Times.Once); Assert.AreEqual("foo", first); @@ -73,7 +86,7 @@ public async Task ExecuteWithRetryAndCachingAsync_ValueIsNotCachedOnFailureAndTh var cacheKey = Guid.NewGuid().ToString(); mockInstanceToRetry.Setup(o => o.ToString()).Throws(); - await Assert.ThrowsExceptionAsync(() => cachingRetryHandler.ExecuteWithRetryAndCachingAsync(cacheKey, _ => Task.Run(() => mockInstanceToRetry.Object.ToString()), CancellationToken.None)); + await Assert.ThrowsExceptionAsync(() => cachingAsyncPolicy.ExecuteWithRetryAndCachingAsync(cacheKey, _ => Task.Run(() => mockInstanceToRetry.Object.ToString()), CancellationToken.None)); Assert.IsFalse(appCache.TryGetValue(cacheKey, out string? 
_)); } @@ -92,7 +105,7 @@ public async Task ExecuteHttpRequestWithRetryAsync_RetriesOnlyOnExpectedFailureC mockFactory.Setup(f => f.CreateResponseAsync()).Returns(CreateResponseAsync(statusCode)); var response = - await cachingRetryHandler.ExecuteWithRetryAsync(_ => + await cachingAsyncHttpResponseMessagePolicy.ExecuteWithRetryAsync(_ => mockFactory.Object.CreateResponseAsync(), CancellationToken.None); diff --git a/src/Tes.ApiClients.Tests/TerraLandingZoneApiClientTest.cs b/src/Tes.ApiClients.Tests/TerraLandingZoneApiClientTest.cs index b7c23b052..b7e01d0d6 100644 --- a/src/Tes.ApiClients.Tests/TerraLandingZoneApiClientTest.cs +++ b/src/Tes.ApiClients.Tests/TerraLandingZoneApiClientTest.cs @@ -13,6 +13,8 @@ public class TerraLandingZoneApiClientTest private TerraLandingZoneApiClient terraLandingZoneApiClient = null!; private Mock tokenCredential = null!; private Mock cacheAndRetryHandler = null!; + private Lazy> asyncRetryPolicy = null!; + private Lazy>> asyncResponseRetryPolicy = null!; private TerraApiStubData terraApiStubData = null!; [TestInitialize] @@ -24,6 +26,8 @@ public void SetUp() var cache = new Mock(); cache.Setup(c => c.CreateEntry(It.IsAny())).Returns(new Mock().Object); cacheAndRetryHandler.SetupGet(c => c.AppCache).Returns(cache.Object); + asyncResponseRetryPolicy = new(TestServices.RetryHandlersHelpers.GetCachingHttpResponseMessageAsyncRetryPolicyMock(cacheAndRetryHandler)); + asyncRetryPolicy = new(TestServices.RetryHandlersHelpers.GetCachingAsyncRetryPolicyMock(cacheAndRetryHandler)); terraLandingZoneApiClient = new TerraLandingZoneApiClient(TerraApiStubData.LandingZoneApiHost, tokenCredential.Object, cacheAndRetryHandler.Object, NullLogger.Instance); } @@ -32,12 +36,12 @@ public async Task GetResourceQuotaAsync_ValidResourceIdReturnsQuotaInformationAn { var body = terraApiStubData.GetResourceQuotaApiResponseInJson(); - cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync( - It.IsAny>>(), It.IsAny(), It.IsAny>())) + 
asyncResponseRetryPolicy.Value.Setup(c => c.ExecuteAsync( + It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(new HttpResponseMessage()); - cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync( - It.IsAny>>(), It.IsAny(), It.IsAny())) + asyncRetryPolicy.Value.Setup(c => c.ExecuteAsync( + It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(body); var quota = await terraLandingZoneApiClient.GetResourceQuotaAsync(terraApiStubData.LandingZoneId, terraApiStubData.BatchAccountId, cacheResults: true, cancellationToken: CancellationToken.None); @@ -63,12 +67,12 @@ public async Task GetLandingZoneResourcesAsync_ListOfLandingZoneResourcesAndGets { var body = terraApiStubData.GetResourceApiResponseInJson(); - cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync( - It.IsAny>>(), It.IsAny(), It.IsAny>())) + asyncResponseRetryPolicy.Value.Setup(c => c.ExecuteAsync( + It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(new HttpResponseMessage()); - cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync( - It.IsAny>>(), It.IsAny(), It.IsAny())) + asyncRetryPolicy.Value.Setup(c => c.ExecuteAsync( + It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(body); var resources = await terraLandingZoneApiClient.GetLandingZoneResourcesAsync(terraApiStubData.LandingZoneId, CancellationToken.None); diff --git a/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs b/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs index 2d4b62a09..49668b34c 100644 --- a/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs +++ b/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs @@ -6,8 +6,10 @@ using Azure.Core; using Microsoft.Extensions.Logging.Abstractions; using Moq; +using Polly; using Tes.ApiClients.Models.Terra; using TesApi.Web.Management.Models.Terra; +using static Tes.ApiClients.CachingRetryHandler; namespace Tes.ApiClients.Tests { @@ -17,6 +19,8 @@ public class TerraWsmApiClientTests private TerraWsmApiClient terraWsmApiClient = null!; private Mock tokenCredential = null!; private Mock 
cacheAndRetryHandler = null!; + private Lazy> asyncRetryPolicy = null!; + private Lazy>> asyncResponseRetryPolicy = null!; private TerraApiStubData terraApiStubData = null!; [TestInitialize] @@ -28,6 +32,8 @@ public void SetUp() var cache = new Mock(); cache.Setup(c => c.CreateEntry(It.IsAny())).Returns(new Mock().Object); cacheAndRetryHandler.SetupGet(c => c.AppCache).Returns(cache.Object); + asyncResponseRetryPolicy = new(TestServices.RetryHandlersHelpers.GetCachingHttpResponseMessageAsyncRetryPolicyMock(cacheAndRetryHandler)); + asyncRetryPolicy = new(TestServices.RetryHandlersHelpers.GetCachingAsyncRetryPolicyMock(cacheAndRetryHandler)); terraWsmApiClient = new TerraWsmApiClient(TerraApiStubData.WsmApiHost, tokenCredential.Object, cacheAndRetryHandler.Object, NullLogger.Instance); } @@ -92,11 +98,11 @@ public async Task GetSasTokenAsync_ValidRequest_ReturnsPayload() Content = new StringContent(terraApiStubData.GetWsmSasTokenApiResponseInJson()) }; - cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny>())) + asyncResponseRetryPolicy.Value.Setup(c => c.ExecuteAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(response); - cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) - .Returns((Func> action, CancellationToken cancellationToken, RetryHandler.OnRetryHandler _2) => action(cancellationToken)); + asyncRetryPolicy.Value.Setup(c => c.ExecuteAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) + .Returns((Func> action, Polly.Context context, CancellationToken cancellationToken) => action(context, cancellationToken)); var apiResponse = await terraWsmApiClient.GetSasTokenAsync(terraApiStubData.WorkspaceId, terraApiStubData.ContainerResourceId, null!, CancellationToken.None); @@ -114,11 +120,13 @@ public async Task GetContainerResourcesAsync_ValidRequest_ReturnsPayload() Content = new StringContent(terraApiStubData.GetContainerResourcesApiResponseInJson()) }; - 
cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny>())) + asyncResponseRetryPolicy.Value + .Setup(c => c.ExecuteAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(response); - cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) - .Returns((Func> action, CancellationToken cancellationToken, RetryHandler.OnRetryHandler _2) => action(cancellationToken)); + asyncRetryPolicy.Value + .Setup(c => c.ExecuteAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) + .Returns((Func> action, Polly.Context context, CancellationToken cancellationToken) => action(context, cancellationToken)); var apiResponse = await terraWsmApiClient.GetContainerResourcesAsync(terraApiStubData.WorkspaceId, offset: 0, limit: 10, CancellationToken.None); @@ -134,7 +142,8 @@ public async Task DeleteBatchPoolAsync_204Response_Succeeds() var wsmResourceId = Guid.NewGuid(); var response = new HttpResponseMessage(HttpStatusCode.NoContent); - cacheAndRetryHandler.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny>())) + asyncResponseRetryPolicy.Value + .Setup(c => c.ExecuteAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(response); await terraWsmApiClient.DeleteBatchPoolAsync(terraApiStubData.WorkspaceId, wsmResourceId, CancellationToken.None); diff --git a/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers-HttpResponseMessage.cs b/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers-HttpResponseMessage.cs new file mode 100644 index 000000000..9a8bdfc64 --- /dev/null +++ b/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers-HttpResponseMessage.cs @@ -0,0 +1,41 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +using Moq; +using Polly.Retry; +using Polly; +using static Tes.ApiClients.CachingRetryHandler; +using static Tes.ApiClients.RetryHandler; + +namespace Tes.ApiClients.Tests.TestServices +{ + internal static partial class RetryHandlersHelpers + { + internal static Mock> GetCachingHttpResponseMessageAsyncRetryPolicyMock(Mock cachingRetryHandler) + { + var cachingAsyncRetryPolicy = new Mock>(); + _ = cachingAsyncRetryPolicy.As(); + var cachingAsyncPolicy = cachingAsyncRetryPolicy.As>(); + var cachingPolicy = cachingAsyncPolicy.As(); + _ = cachingAsyncRetryPolicy.As>(); + var cachingPolicyBuild = new Mock>(); + cachingPolicyBuild.Setup(policy => policy.BuildAsync()) + .Returns(cachingAsyncRetryPolicy.Object); + cachingRetryHandler.As().Setup(policy => policy.CachingPolicyBuilder(It.IsAny>())) + .Returns(cachingPolicyBuild.Object); + var builderBuild = new Mock>(); + var policyBuilderWait = new Mock>(); + policyBuilderWait.Setup(policy => policy.SetOnRetryBehavior(It.IsAny(), It.IsAny>(), It.IsAny>())) + .Returns(builderBuild.Object); + cachingRetryHandler.Setup(cachingRetryHandler => cachingRetryHandler.RetryDefaultHttpResponseMessagePolicyBuilder()) + .Returns(policyBuilderWait.Object); + builderBuild.Setup(policy => policy.PolicyBuilderBase) + .Returns(cachingRetryHandler.Object); + cachingPolicy.Setup(c => c.Handler) + .Returns(cachingRetryHandler.Object); + cachingPolicy.Setup(c => c.AppCache) + .Returns(cachingRetryHandler.Object.AppCache); + return cachingAsyncRetryPolicy; + } + } +} diff --git a/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers.cs b/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers.cs new file mode 100644 index 000000000..9d64f2a70 --- /dev/null +++ b/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers.cs @@ -0,0 +1,44 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +using Moq; +using Polly.Retry; +using Polly; +using static Tes.ApiClients.CachingRetryHandler; +using static Tes.ApiClients.RetryHandler; + +namespace Tes.ApiClients.Tests.TestServices +{ + internal static partial class RetryHandlersHelpers + { + // TODO: Add ability to use a mocked ILogger with a mocked CachingRetryHandler where failures in the mocked retry handlers call the mocked ILogger. + // The opt-in would be an optional argument like this: "Microsoft.Extensions.Logging.ILogger logger". + + internal static Mock GetCachingAsyncRetryPolicyMock(Mock cachingRetryHandler) + { + var cachingAsyncRetryPolicy = new Mock(); + _ = cachingAsyncRetryPolicy.As(); + var cachingAsyncPolicy = cachingAsyncRetryPolicy.As(); + var cachingPolicy = cachingAsyncPolicy.As(); + _ = new Mock(); + var cachingPolicyBuild = new Mock(); + cachingPolicyBuild.Setup(policy => policy.BuildAsync()) + .Returns(cachingAsyncRetryPolicy.Object); + cachingRetryHandler.As().Setup(policy => policy.CachingPolicyBuilder(It.IsAny())) + .Returns(cachingPolicyBuild.Object); + var builderBuild = new Mock(); + var policyBuilderWait = new Mock(); + policyBuilderWait.Setup(policy => policy.SetOnRetryBehavior(It.IsAny(), It.IsAny(), It.IsAny())) + .Returns(builderBuild.Object); + cachingRetryHandler.Setup(cachingRetryHandler => cachingRetryHandler.RetryDefaultPolicyBuilder()) + .Returns(policyBuilderWait.Object); + builderBuild.Setup(policy => policy.PolicyBuilderBase) + .Returns(cachingRetryHandler.Object); + cachingPolicy.Setup(c => c.Handler) + .Returns(cachingRetryHandler.Object); + cachingPolicy.Setup(c => c.AppCache) + .Returns(cachingRetryHandler.Object.AppCache); + return cachingAsyncRetryPolicy; + } + } +} diff --git a/src/Tes.ApiClients/CachingRetryHandler.cs b/src/Tes.ApiClients/CachingRetryHandler.cs index e2cb2602f..50b837dae 100644 --- a/src/Tes.ApiClients/CachingRetryHandler.cs +++ b/src/Tes.ApiClients/CachingRetryHandler.cs @@ -3,14 +3,70 @@ using Microsoft.Extensions.Caching.Memory; using 
Microsoft.Extensions.Options; +using Polly; +using Polly.Retry; using Tes.ApiClients.Options; +using static Tes.ApiClients.CachingRetryHandler; namespace Tes.ApiClients { + /// + /// Extension methods for + /// + public static class CachingRetryHandlerExtensions + { + /// + /// Executes a delegate with the specified async retry policy and persisting the result in a cache. + /// + /// Asynchronous caching retry policy + /// + /// Action to execute + /// A for controlling the lifetime of the asynchronous operation. + /// Name of method originating the retriable operation. + /// + public static Task ExecuteWithRetryAndCachingAsync(this ICachingAsyncPolicy retryPolicy, string cacheKey, Func> action, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + { + ArgumentNullException.ThrowIfNull(retryPolicy); + ValidateArgs(cacheKey, action); + + return retryPolicy.Handler.ExecuteWithCacheAsync(cacheKey, () => retryPolicy.ExecuteWithRetryAsync(action, cancellationToken, caller)); + } + + /// + /// Executes a delegate with the specified async retry policy and persisting the result in a cache. + /// + /// Asynchronous caching retry policy + /// + /// Action to execute + /// + /// A for controlling the lifetime of the asynchronous operation. + /// Name of method originating the retriable operation. 
+ /// + /// + public static Task ExecuteWithRetryAndCachingAsync(this ICachingAsyncPolicy retryPolicy, string cacheKey, Func> action, DateTimeOffset cachesExpires, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + { + ArgumentNullException.ThrowIfNull(retryPolicy); + ValidateArgs(cacheKey, action); + + return retryPolicy.Handler.ExecuteWithCacheAsync(cacheKey, () => retryPolicy.ExecuteWithRetryAsync(action, cancellationToken, caller), cachesExpires); + } + + public static ICachingPolicyBuilderBuild AddCaching(this RetryHandler.IPolicyBuilderBuild policyBuilder) + { + return ((ICachingPolicyBuilderHandler)policyBuilder.PolicyBuilderBase).CachingPolicyBuilder(policyBuilder); + } + + public static ICachingPolicyBuilderBuild AddCaching(this RetryHandler.IPolicyBuilderBuild policyBuilder) + { + return ((ICachingPolicyBuilderHandler)policyBuilder.PolicyBuilderBase).CachingPolicyBuilder(policyBuilder); + } + } + + /// /// Contains an App Cache instances and retry policies. /// - public class CachingRetryHandler : RetryHandler + public partial class CachingRetryHandler : RetryHandler, ICachingPolicyBuilderHandler { private readonly IMemoryCache appCache = null!; @@ -36,40 +92,169 @@ public CachingRetryHandler(IMemoryCache appCache, IOptions r /// protected CachingRetryHandler() { } + #region Builder interfaces + public interface ICachingPolicy + { + IMemoryCache AppCache { get; } - /// - /// Executes a delegate with the specified async retry policy and persisting the result in a cache. - /// - /// - /// Action to execute - /// A for controlling the lifetime of the asynchronous operation. - /// to use. Defaults to none. - /// - public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, CancellationToken cancellationToken, OnRetryHandler onRetry = default) + /// Used internally and for testing. 
+ public CachingRetryHandler Handler { get; } + } + + public interface ICachingSyncPolicy : ICachingPolicy, ISyncPolicy { } + public interface ICachingSyncPolicy : ICachingPolicy, ISyncPolicy { } + public interface ICachingAsyncPolicy : ICachingPolicy, IAsyncPolicy { } + public interface ICachingAsyncPolicy : ICachingPolicy, IAsyncPolicy { } + + public interface ICachingPolicyBuilderBuild { - ValidateArgs(cacheKey, action); + ICachingSyncPolicy Build(); + ICachingAsyncPolicy BuildAsync(); + } - return await ExecuteWithCacheAsync(cacheKey, () => ExecuteWithRetryAsync(action, cancellationToken, onRetry)); + public interface ICachingPolicyBuilderBuild + { + ICachingSyncPolicy Build(); + ICachingAsyncPolicy BuildAsync(); } - /// - /// Executes a delegate with the specified async retry policy and persisting the result in a cache. - /// - /// - /// Action to execute - /// - /// A for controlling the lifetime of the asynchronous operation. - /// to use. Defaults to none. - /// - /// - public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, DateTimeOffset cachesExpires, CancellationToken cancellationToken, OnRetryHandler onRetry = default) + /// Used internally and for testing. 
+ public interface ICachingPolicyBuilderHandler { - ValidateArgs(cacheKey, action); + ICachingPolicyBuilderBuild CachingPolicyBuilder(IPolicyBuilderBuild policyBuilder); + ICachingPolicyBuilderBuild CachingPolicyBuilder(IPolicyBuilderBuild policyBuilder); + } + #endregion + + #region Builder interface implementations + ICachingPolicyBuilderBuild ICachingPolicyBuilderHandler.CachingPolicyBuilder(IPolicyBuilderBuild policyBuilder) + => new CachingPolicyBuilderBuild(policyBuilder, this); + + ICachingPolicyBuilderBuild ICachingPolicyBuilderHandler.CachingPolicyBuilder(IPolicyBuilderBuild policyBuilder) + => new CachingPolicyBuilderBuild(policyBuilder, this); + + private readonly struct CachingPolicyBuilderBuild : ICachingPolicyBuilderBuild, ICachingPolicyBuilderBuild + { + private readonly IPolicyBuilderBuild policyBuilder; + private readonly IPolicyBuilderBuild genericPolicyBuilder; + private readonly CachingRetryHandler cachingHandler; + + public CachingPolicyBuilderBuild(IPolicyBuilderBuild policyBuilder, CachingRetryHandler handler) + { + ArgumentNullException.ThrowIfNull(policyBuilder); + this.policyBuilder = policyBuilder; + this.cachingHandler = handler; + } + + public CachingPolicyBuilderBuild(IPolicyBuilderBuild policyBuilder, CachingRetryHandler handler) + { + ArgumentNullException.ThrowIfNull(policyBuilder); + this.genericPolicyBuilder = policyBuilder; + this.cachingHandler = handler; + } + + public ICachingSyncPolicy Build() + { + return new CachingRetryPolicy(cachingHandler, policyBuilder.Build()); + } + + public ICachingAsyncPolicy BuildAsync() + { + return new CachingAsyncRetryPolicy(cachingHandler, policyBuilder.BuildAsync()); + } + + ICachingSyncPolicy ICachingPolicyBuilderBuild.Build() + { + return new CachingRetryPolicy(cachingHandler, genericPolicyBuilder.Build()); + } - return await ExecuteWithCacheAsync(cacheKey, () => ExecuteWithRetryAsync(action, cancellationToken, onRetry), cachesExpires); + ICachingAsyncPolicy 
ICachingPolicyBuilderBuild.BuildAsync() + { + return new CachingAsyncRetryPolicy(cachingHandler, genericPolicyBuilder.BuildAsync()); + } + } + + private partial class CachingRetryPolicy : IRetryPolicy, ICachingSyncPolicy + { + [BeaKona.AutoInterface] + private readonly ISyncPolicy policy; + [BeaKona.AutoInterface] + private readonly IsPolicy retryPolicy; + private readonly CachingRetryHandler handler; + + public IMemoryCache AppCache => handler.AppCache; + CachingRetryHandler ICachingPolicy.Handler => handler; + + public CachingRetryPolicy(CachingRetryHandler handler, ISyncPolicy policy) + { + ArgumentNullException.ThrowIfNull(policy); + retryPolicy = (IRetryPolicy)policy; + this.policy = policy; + this.handler = handler; + } + } + + private partial class CachingRetryPolicy : IRetryPolicy, ICachingSyncPolicy + { + [BeaKona.AutoInterface] + private readonly ISyncPolicy policy; + [BeaKona.AutoInterface] + private readonly IsPolicy retryPolicy; + private readonly CachingRetryHandler handler; + + public IMemoryCache AppCache => handler.AppCache; + CachingRetryHandler ICachingPolicy.Handler => handler; + + public CachingRetryPolicy(CachingRetryHandler handler, ISyncPolicy policy) + { + ArgumentNullException.ThrowIfNull(policy); + retryPolicy = (IRetryPolicy)policy; + this.policy = policy; + this.handler = handler; + } + } + + private partial class CachingAsyncRetryPolicy : IRetryPolicy, ICachingAsyncPolicy + { + [BeaKona.AutoInterface] + private readonly IAsyncPolicy policy; + [BeaKona.AutoInterface] + private readonly IsPolicy retryPolicy; + private readonly CachingRetryHandler handler; + + public IMemoryCache AppCache => handler.AppCache; + CachingRetryHandler ICachingPolicy.Handler => handler; + + public CachingAsyncRetryPolicy(CachingRetryHandler handler, IAsyncPolicy policy) + { + retryPolicy = (IRetryPolicy)policy; + this.policy = policy; + this.handler = handler; + } + } + + private partial class CachingAsyncRetryPolicy : IRetryPolicy, ICachingAsyncPolicy + { + 
[BeaKona.AutoInterface] + private readonly IAsyncPolicy policy; + [BeaKona.AutoInterface] + private readonly IsPolicy retryPolicy; + private readonly CachingRetryHandler handler; + + public IMemoryCache AppCache => handler.AppCache; + CachingRetryHandler ICachingPolicy.Handler => handler; + + public CachingAsyncRetryPolicy(CachingRetryHandler handler, IAsyncPolicy policy) + { + ArgumentNullException.ThrowIfNull(policy); + retryPolicy = (IRetryPolicy)policy; + this.policy = policy; + this.handler = handler; + } } + #endregion - private static void ValidateArgs(string cacheKey, Func action) + internal static void ValidateArgs(string cacheKey, Func action) { ArgumentNullException.ThrowIfNull(action); @@ -79,10 +264,10 @@ private static void ValidateArgs(string cacheKey, Func } } - private async Task ExecuteWithCacheAsync(string cacheKey, Func> action) + internal async Task ExecuteWithCacheAsync(string cacheKey, Func> action) => await appCache.GetOrCreateAsync(cacheKey, _ => action()); - private async Task ExecuteWithCacheAsync(string cacheKey, Func> action, DateTimeOffset cacheExpires) + internal async Task ExecuteWithCacheAsync(string cacheKey, Func> action, DateTimeOffset cacheExpires) => await appCache.GetOrCreateAsync(cacheKey, entry => { entry.AbsoluteExpiration = cacheExpires; diff --git a/src/Tes.ApiClients/HttpApiClient.cs b/src/Tes.ApiClients/HttpApiClient.cs index 608c8a60e..aabda7124 100644 --- a/src/Tes.ApiClients/HttpApiClient.cs +++ b/src/Tes.ApiClients/HttpApiClient.cs @@ -18,7 +18,6 @@ public abstract class HttpApiClient { private static readonly HttpClient HttpClient = new(); private readonly TokenCredential tokenCredential = null!; - private readonly CachingRetryHandler cachingRetryHandler = null!; private readonly SHA256 sha256 = SHA256.Create(); /// /// Logger instance @@ -28,6 +27,9 @@ public abstract class HttpApiClient private readonly SemaphoreSlim semaphore = new(1, 1); private AccessToken accessToken; + protected readonly 
CachingRetryHandler.ICachingAsyncPolicy AsyncCachingHttpResponseMessageRetryPolicy; + protected readonly CachingRetryHandler.ICachingAsyncPolicy AsyncCachingRetryPolicy; + /// /// Inner http client. /// @@ -43,8 +45,19 @@ protected HttpApiClient(CachingRetryHandler cachingRetryHandler, ILogger logger) ArgumentNullException.ThrowIfNull(cachingRetryHandler); ArgumentNullException.ThrowIfNull(logger); - this.cachingRetryHandler = cachingRetryHandler; this.Logger = logger; + + AsyncCachingRetryPolicy = cachingRetryHandler + .RetryDefaultPolicyBuilder() + .SetOnRetryBehavior(logger: this.Logger) + .AddCaching() + .BuildAsync(); + + AsyncCachingHttpResponseMessageRetryPolicy = cachingRetryHandler + .RetryDefaultHttpResponseMessagePolicyBuilder() + .SetOnRetryBehavior(onRetry: LogRetryErrorOnRetryHttpResponseMessageHandler()) + .AddCaching() + .BuildAsync(); } /// @@ -72,10 +85,9 @@ protected HttpApiClient() { } /// /// A logging Polly retry handler. /// - /// Calling method name. /// - private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHttpResponseMessageHandler([System.Runtime.CompilerServices.CallerMemberName] string caller = default) - => new((result, timeSpan, retryCount, correlationId) => + private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHttpResponseMessageHandler() + => new((result, timeSpan, retryCount, correlationId, caller) => { if (result.Exception is null) { @@ -100,7 +112,7 @@ private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHtt protected async Task HttpSendRequestWithRetryPolicyAsync( Func httpRequestFactory, CancellationToken cancellationToken, bool setAuthorizationHeader = false) { - return await cachingRetryHandler.ExecuteWithRetryAsync(async ct => + return await AsyncCachingHttpResponseMessageRetryPolicy.ExecuteWithRetryAsync(async ct => { var request = httpRequestFactory(); if (setAuthorizationHeader) @@ -109,7 +121,7 @@ protected async Task HttpSendRequestWithRetryPolicyAsync( } return await HttpClient.SendAsync(request, ct); - }, 
cancellationToken, LogRetryErrorOnRetryHttpResponseMessageHandler()); + }, cancellationToken); } /// @@ -159,7 +171,7 @@ protected async Task HttpGetRequestWithCachingAndRetryPolicyAsync(Uri re { var cacheKey = await ToCacheKeyAsync(requestUrl, setAuthorizationHeader, cancellationToken); - return (await cachingRetryHandler.AppCache.GetOrCreateAsync(cacheKey, + return (await AsyncCachingRetryPolicy.AppCache.GetOrCreateAsync(cacheKey, async _ => await HttpGetRequestWithRetryPolicyAsync(requestUrl, cancellationToken, setAuthorizationHeader)))!; } @@ -173,19 +185,19 @@ protected async Task HttpGetRequestWithCachingAndRetryPolicyAsync(Uri re protected async Task HttpGetRequestWithRetryPolicyAsync(Uri requestUrl, CancellationToken cancellationToken, bool setAuthorizationHeader = false) { - return await cachingRetryHandler.ExecuteWithRetryAsync(async token => + return await AsyncCachingRetryPolicy.ExecuteWithRetryAsync(async token => { - var response = await cachingRetryHandler.ExecuteWithRetryAsync(async ct => + var response = await AsyncCachingHttpResponseMessageRetryPolicy.ExecuteWithRetryAsync(async ct => { //request must be recreated in every retry. 
var httpRequest = await CreateGetHttpRequest(requestUrl, setAuthorizationHeader, ct); var httpResponse = await HttpClient.SendAsync(httpRequest, ct); return httpResponse.EnsureSuccessStatusCode(); - }, token, LogRetryErrorOnRetryHttpResponseMessageHandler()); + }, token); return await ReadResponseBodyAsync(response, token); - }, cancellationToken, RetryHandler.LogRetryErrorOnRetryHandler(Logger)); + }, cancellationToken); } /// diff --git a/src/Tes.ApiClients/RetryHandler.cs b/src/Tes.ApiClients/RetryHandler.cs index 689959570..5234d50b0 100644 --- a/src/Tes.ApiClients/RetryHandler.cs +++ b/src/Tes.ApiClients/RetryHandler.cs @@ -5,217 +5,716 @@ using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using Polly; -using Polly.Extensions.Http; using Polly.Retry; using Tes.ApiClients.Options; namespace Tes.ApiClients; /// -/// Utility class that facilitates the retry policy implementations for HTTP clients. +/// Extension methods for /// -public class RetryHandler +public static class RetryHandlerExtensions { - private readonly RetryPolicy retryPolicy = null!; - private readonly AsyncRetryPolicy asyncRetryPolicy = null!; - private readonly AsyncRetryPolicy asyncHttpRetryPolicy = null!; - /// - /// An opinionated generic logging retry handler. + /// Executes a delegate with the specified policy. /// - /// to use. + /// Result type. + /// Synchronous retry policy. + /// Action to execute. /// Name of method originating the retriable operation. - /// - public static OnRetryHandler LogRetryErrorOnRetryHandler(ILogger logger, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + /// instance. 
+ public static TResult ExecuteWithRetry(this ISyncPolicy retryPolicy, Func action, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) { - return new((exception, timeSpan, retryCount, correlationId) => - logger?.LogError(exception, @"Retrying in {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan} CorrelationId: {CorrelationId}", caller, retryCount, timeSpan, correlationId.ToString("D"))); + ArgumentNullException.ThrowIfNull(retryPolicy); + ArgumentNullException.ThrowIfNull(action); + + return retryPolicy.Execute(_ => action(), PrepareContext(caller)); } /// - /// The key in where or is stored. + /// Executes a delegate with the specified async policy. /// - public const string OnRetryHandlerKey = "OnRetryHandler"; + /// Result type. + /// Asynchronous retry policy. + /// Action to execute. + /// A for controlling the lifetime of the asynchronous operation. + /// Name of method originating the retriable operation. + /// instance. + public static Task ExecuteWithRetryAsync(this IAsyncPolicy retryPolicy, Func> action, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + { + ArgumentNullException.ThrowIfNull(retryPolicy); + ArgumentNullException.ThrowIfNull(action); - /// - /// The action to call on each retry. - /// - /// The handled exception. - /// The current sleep duration. - /// The current retry count. It starts at 1 between the first handled condition and the first wait, then 2, etc. - /// A Guid guaranteed to be unique to each execution. Acts as a correlation id so that events specific to a single execution can be identified in logging and telemetry. - /// This is called right before the wait. - public delegate void OnRetryHandler(Exception outcome, TimeSpan timespan, int retryCount, Guid correlationId); + return retryPolicy.ExecuteAsync((_, ct) => action(ct), PrepareContext(caller), cancellationToken); + } - /// - /// The action to call on each retry. - /// - /// See . 
- /// The handled exception or result. - /// The current sleep duration. - /// The current retry count. It starts at 1 between the first handled condition and the first wait, then 2, etc. - /// A Guid guaranteed to be unique to each execution. Acts as a correlation id so that events specific to a single execution can be identified in logging and telemetry. - /// This is called right before the wait. - public delegate void OnRetryHandler(DelegateResult result, TimeSpan timespan, int retryCount, Guid correlationId); /// - /// Synchronous retry policy instance. + /// Executes a delegate with the specified async policy. /// - public virtual RetryPolicy RetryPolicy => retryPolicy; - - public RetryHandler(IOptions retryPolicyOptions) + /// Asynchronous retry policy. + /// Action to execute. + /// A for controlling the lifetime of the asynchronous operation. + /// Name of method originating the retriable operation. + /// + public static Task ExecuteWithRetryAsync(this IAsyncPolicy retryPolicy, Func action, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) { - ArgumentNullException.ThrowIfNull(retryPolicyOptions); + ArgumentNullException.ThrowIfNull(retryPolicy); + ArgumentNullException.ThrowIfNull(action); - this.retryPolicy = Policy - .Handle() - .WaitAndRetry(retryPolicyOptions.Value.MaxRetryCount, - (attempt) => TimeSpan.FromSeconds(Math.Pow(retryPolicyOptions.Value.ExponentialBackOffExponent, - attempt)), OnRetry); - this.asyncRetryPolicy = Policy - .Handle() - .WaitAndRetryAsync(retryPolicyOptions.Value.MaxRetryCount, - (attempt) => TimeSpan.FromSeconds(Math.Pow(retryPolicyOptions.Value.ExponentialBackOffExponent, - attempt)), OnRetry); - this.asyncHttpRetryPolicy = HttpPolicyExtensions.HandleTransientHttpError() - .OrResult(r => r.StatusCode == HttpStatusCode.TooManyRequests) - .WaitAndRetryAsync(retryPolicyOptions.Value.MaxRetryCount, - (attempt) => 
TimeSpan.FromSeconds(Math.Pow(retryPolicyOptions.Value.ExponentialBackOffExponent, - attempt)), OnRetry); + return retryPolicy.ExecuteAsync((_, ct) => action(ct), PrepareContext(caller), cancellationToken); } - public static void OnRetry(DelegateResult result, TimeSpan span, int retryCount, Context ctx) + /// + /// Executes a delegate with the specified async policy. + /// + /// Asynchronous retry policy + /// Action to execute + /// A for controlling the lifetime of the asynchronous operation. + /// Name of method originating the retriable operation. + /// Result HttpResponse + public static Task ExecuteWithRetryAsync(this IAsyncPolicy retryPolicy, Func> action, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) { - ctx.GetOnRetryHandler()?.Invoke(result, span, retryCount, ctx.CorrelationId); + ArgumentNullException.ThrowIfNull(retryPolicy); + ArgumentNullException.ThrowIfNull(action); + + return retryPolicy.ExecuteAsync((_, ct) => action(ct), PrepareContext(caller), cancellationToken); } - public static void OnRetry(Exception outcome, TimeSpan timespan, int retryCount, Context ctx) + private static Context PrepareContext(string caller) => new() { - ctx.GetOnRetryHandler()?.Invoke(outcome, timespan, retryCount, ctx.CorrelationId); - } + [RetryHandler.CallerMemberNameKey] = caller + }; +} - /// - /// Parameter-less constructor for mocking - /// - protected RetryHandler() { } +/// +/// Utility class that facilitates the retry policy implementations for HTTP clients. +/// +public class RetryHandler +{ + // Keys used by this implementation in Polly Contexts + public const string CallerMemberNameKey = $"Tes.ApiClients.{nameof(RetryHandler)}.CallerMemberName"; + public const string BackupSkipProvidedIncrementKey = $"Tes.ApiClients.{nameof(RetryHandler)}.BackupSkipProvidedIncrementCount"; /// - /// Asynchronous retry policy instance. 
+ /// The main starting point for building retry policies /// - public virtual AsyncRetryPolicy AsyncRetryPolicy => asyncRetryPolicy; + public IPolicyBuilderPolicy PolicyBuilder => new PolicyBuilderPolicy(defaultOptions); /// - /// Configures to use if provided. + /// The default HttpResponseMessage retry policy builder. /// - /// Return type of asynchronous delegate. - /// to use. Defaults to none. - /// - public static Context PrepareContext(OnRetryHandler onRetry = default) - { - var context = new Context(); - - if (onRetry is not null) - { - context.SetOnRetryHandler(onRetry); - } + public static PolicyBuilder DefaultHttpResponseMessagePolicyBuilder => + Polly.Extensions.Http.HttpPolicyExtensions.HandleTransientHttpError() + .OrResult(r => r.StatusCode == HttpStatusCode.TooManyRequests); + + /// Shortcut starting point for testing. + public virtual IPolicyBuilderWait RetryDefaultPolicyBuilder() + => PolicyBuilder + .OpinionatedRetryPolicy() + .WithRetryPolicyOptionsWait(); + + /// Shortcut starting point for testing. + public virtual IPolicyBuilderWait RetryDefaultHttpResponseMessagePolicyBuilder() + => PolicyBuilder + .OpinionatedRetryPolicy(DefaultHttpResponseMessagePolicyBuilder) + .WithRetryPolicyOptionsWait(); - return context; + public RetryHandler(IOptions retryPolicyOptions) + { + ArgumentNullException.ThrowIfNull(retryPolicyOptions); + defaultOptions = new(this, retryPolicyOptions.Value); } /// - /// Configures to use if provided. + /// Parameter-less constructor for mocking /// - /// to use. Defaults to none. - /// for the retriable operation. - public static Context PrepareContext(OnRetryHandler onRetry = default) + protected RetryHandler() { } + + #region Builder interfaces + public interface IPolicyBuilderPolicy { - var context = new Context(); + /// + /// Default retry policy. + /// + /// retry policy builder. + IPolicyBuilderBase OpinionatedRetryPolicy(); + + /// + /// Custom retry policy. 
+ /// + /// Builder class that holds the list of current exception predicates. + /// Custom retry policy builder. + IPolicyBuilderBase OpinionatedRetryPolicy(PolicyBuilder policyBuilder); + + /// + /// Generic retry policy. + /// + /// Result values. + /// Builder class that holds the list of current execution predicates filtering TResult result values. + /// Generic retry policy builder. + IPolicyBuilderBase OpinionatedRetryPolicy(PolicyBuilder policyBuilder); + } - if (onRetry is not null) - { - context.SetOnRetryHandler(onRetry); - } + public interface IPolicyBuilderBase + { + /// + /// Default wait policy. + /// + /// Wait policy. + IPolicyBuilderWait WithRetryPolicyOptionsWait(); + + /// + /// Custom exponential wait policy. + /// + /// Maximum number of retries. + /// Value in seconds which is raised by the power of the retry attempt. + /// Wait policy. + IPolicyBuilderWait WithExponentialBackoffWait(int maxRetryCount, double exponentialBackOffExponent); + + /// + /// Custom exception-based wait policy. + /// + /// Maximum number of retries. + /// Wait policy. + /// Wait policy. + IPolicyBuilderWait WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func waitDurationProvider); + + /// + /// Custom optional exception-based wait policy backed up by an exponential wait policy. + /// + /// Wait policy that can return to use the backup wait policy. + /// Maximum number of retries. + /// Value in seconds which is raised by the power of the backup retry attempt. + /// True to pass backup wait provider its own attempt values, False to provide overall attempt values. + /// Wait policy. + IPolicyBuilderWait WithExceptionBasedWaitWithExponentialBackoffBackup(Func waitDurationProvider, int maxRetryCount, int exponentialBackOffExponent, bool backupSkipProvidedIncrements); + + /// + /// Custom optional exception-based wait policy backed up by the default wait policy. + /// + /// Wait policy that can return to use the backup wait policy.
+ /// True to pass backup wait provider its own attempt values, False to provide overall attempt values. + /// Wait policy. + IPolicyBuilderWait WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func waitDurationProvider, bool backupSkipProvidedIncrements); + } - return context; + public interface IPolicyBuilderBase + { + /// + /// Default wait policy. + /// + /// Wait policy. + IPolicyBuilderWait WithRetryPolicyOptionsWait(); + + /// + /// Custom exponential wait policy. + /// + /// Maximum number of retries. + /// Value in seconds which is raised by the power of the retry attempt. + /// Wait policy. + IPolicyBuilderWait WithExponentialBackoffWait(int retryCount, double exponentialBackOffExponent); + + /// + /// Custom result-based policy. + /// + /// Maximum number of retries. + /// Wait policy. + /// Wait policy. + /// + IPolicyBuilderWait WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func, TimeSpan> waitDurationProvider); + + /// + /// Custom exception-based wait policy. + /// + /// Maximum number of retries. + /// Wait policy. + /// Wait policy. + IPolicyBuilderWait WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func waitDurationProvider); + + /// + /// Custom optional exception-based wait policy backed up by an exponential wait policy. + /// + /// Wait policy that can return to use the backup wait policy. + /// Maximum number of retries. + /// Value in seconds which is raised by the power of the backup retry attempt. + /// True to pass backup wait provider its own attempt values, False to provide overall attempt values. + /// Wait policy. + IPolicyBuilderWait WithExceptionBasedWaitWithExponentialBackoffBackup(Func waitDurationProvider, int retryCount, int exponentialBackOffExponent, bool backupSkipProvidedIncrements); + + /// + /// Custom optional exception-based wait policy backed up by the default wait policy. + /// + /// Wait policy that can return to use the backup wait policy.
+ /// True to pass backup wait provider its own attempt values, False to provide overall attempt values. + /// Wait policy. + IPolicyBuilderWait WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func waitDurationProvider, bool backupSkipProvidedIncrements); } - /// - /// Executes a delegate with the specified policy. - /// - /// Action to execute. - /// to use. Defaults to none. - /// Result type - /// Result instance - public TResult ExecuteWithRetry(Func action, OnRetryHandler onRetry = default) + public interface IPolicyBuilderWait { - ArgumentNullException.ThrowIfNull(action); + /// + /// OnRetry behaviors + /// + /// Logger to enable retry logging. + /// Custom onretry handler. + /// Custom async onretry handler. Only applies to . + /// OnRetry handler + IPolicyBuilderBuild SetOnRetryBehavior(ILogger logger = default, OnRetryHandler onRetry = default, OnRetryHandlerAsync onRetryAsync = default); + } - return retryPolicy.Execute(_ => action(), PrepareContext(onRetry)); + public interface IPolicyBuilderWait + { + /// + /// OnRetry behaviors + /// + /// Logger to enable retry logging. + /// Custom onretry handler. + /// Custom async onretry handler. Only applies to . + /// OnRetry handler + IPolicyBuilderBuild SetOnRetryBehavior(ILogger logger = default, OnRetryHandler onRetry = default, OnRetryHandlerAsync onRetryAsync = default); } - /// - /// Executes a delegate with the specified async policy. - /// - /// Action to execute - /// A for controlling the lifetime of the asynchronous operation. - /// to use. Defaults to none. - /// Result type - /// Result instance - public virtual Task ExecuteWithRetryAsync(Func> action, CancellationToken cancellationToken, OnRetryHandler onRetry = default) + public interface IPolicyBuilderBuild { - ArgumentNullException.ThrowIfNull(action); + /// + /// Builds . + /// + /// Retry policy. + ISyncPolicy Build(); + + /// + /// Builds . + /// + /// Async retry policy.
+ IAsyncPolicy BuildAsync(); + + /// + /// Retrieves the instance of the retryhandler to accommodate extensions to the builder + /// + RetryHandler PolicyBuilderBase { get; } + } - return asyncRetryPolicy.ExecuteAsync((_, ct) => action(ct), PrepareContext(onRetry), cancellationToken); + public interface IPolicyBuilderBuild + { + /// + /// Builds . + /// + /// Retry policy. + ISyncPolicy Build(); + + /// + /// Builds . + /// + /// Async retry policy. + IAsyncPolicy BuildAsync(); + + /// + /// Retrieves the instance of the retryhandler to accommodate extensions to the builder + /// + RetryHandler PolicyBuilderBase { get; } } + #endregion + #region Delegates for custom on-retry handlers /// - /// Executes a delegate with the specified async policy. + /// The action to call on each retry. /// - /// Action to execute - /// A for controlling the lifetime of the asynchronous operation. - /// to use. Defaults to none. - /// Result instance - public async Task ExecuteWithRetryAsync(Func action, CancellationToken cancellationToken, OnRetryHandler onRetry = default) - { - ArgumentNullException.ThrowIfNull(action); + /// The handled exception. + /// The current sleep duration. + /// The current retry count.
It starts at 1 between the first handled condition and the first wait, then 2, etc. + /// A Guid guaranteed to be unique to each execution. Acts as a correlation id so that events specific to a single execution can be identified in logging and telemetry. + /// Name of method originating the retriable operation. + /// This is called right before the wait. + public delegate Task OnRetryHandlerAsync(Exception outcome, TimeSpan timespan, int retryCount, Guid correlationId, string caller); /// - /// Executes a delegate with the specified async policy. + /// The action to call on each retry. /// - /// Action to execute - /// A for controlling the lifetime of the asynchronous operation. - /// to use. Defaults to none. - /// Result HttpResponse - public virtual async Task ExecuteWithRetryAsync(Func> action, CancellationToken cancellationToken, OnRetryHandler onRetry = default) - { - ArgumentNullException.ThrowIfNull(action); + /// See . + /// The handled exception or result. + /// The current sleep duration. + /// The current retry count. It starts at 1 between the first handled condition and the first wait, then 2, etc. + /// A Guid guaranteed to be unique to each execution. Acts as a correlation id so that events specific to a single execution can be identified in logging and telemetry. + /// Name of method originating the retriable operation. + /// This is called right before the wait. + public delegate void OnRetryHandler(DelegateResult result, TimeSpan timespan, int retryCount, Guid correlationId, string caller); - return await asyncHttpRetryPolicy.ExecuteAsync((_, ct) => action(ct), PrepareContext(onRetry), cancellationToken); - } -} + /// + /// The action to call on each retry. + /// + /// See . + /// The handled exception or result. + /// The current sleep duration. + /// The current retry count. It starts at 1 between the first handled condition and the first wait, then 2, etc. + /// A Guid guaranteed to be unique to each execution. 
Acts as a correlation id so that events specific to a single execution can be identified in logging and telemetry. + /// Name of method originating the retriable operation. + /// This is called right before the wait. + public delegate Task OnRetryHandlerAsync(DelegateResult result, TimeSpan timespan, int retryCount, Guid correlationId, string caller); + #endregion -public static class RetryHandlerExtensions -{ - public static void SetOnRetryHandler(this Context context, RetryHandler.OnRetryHandler onRetry) - { - context[RetryHandler.OnRetryHandlerKey] = onRetry; - } + #region Builder interface implementations + private readonly Defaults defaultOptions; - public static RetryHandler.OnRetryHandler GetOnRetryHandler(this Context context) + private readonly struct Defaults { - return context.TryGetValue(RetryHandler.OnRetryHandlerKey, out var handler) ? (RetryHandler.OnRetryHandler)handler : default; - } + public readonly RetryPolicyOptions PolicyOptions; + public readonly RetryHandler PolicyBuilderBase; - public static void SetOnRetryHandler(this Context context, RetryHandler.OnRetryHandler onRetry) - { - context[RetryHandler.OnRetryHandlerKey] = onRetry; + internal Defaults(RetryHandler retryHandler, RetryPolicyOptions options) + { + ArgumentNullException.ThrowIfNull(options); + PolicyOptions = options; + PolicyBuilderBase = retryHandler; + } } - public static RetryHandler.OnRetryHandler GetOnRetryHandler(this Context context) + private readonly struct PolicyBuilderPolicy : IPolicyBuilderPolicy { - return context.TryGetValue(RetryHandler.OnRetryHandlerKey, out var handler) ? 
(RetryHandler.OnRetryHandler)handler : default; + private readonly Defaults Defaults; + + internal PolicyBuilderPolicy(Defaults options) + { + ArgumentNullException.ThrowIfNull(options); + Defaults = options; + } + + /// + IPolicyBuilderBase IPolicyBuilderPolicy.OpinionatedRetryPolicy() + => new PolicyBuilderBase(Policy.Handle(), Defaults); + + /// + IPolicyBuilderBase IPolicyBuilderPolicy.OpinionatedRetryPolicy(PolicyBuilder policy) + => new PolicyBuilderBase(policy, Defaults); + + /// + IPolicyBuilderBase IPolicyBuilderPolicy.OpinionatedRetryPolicy(PolicyBuilder policy) + => new PolicyBuilderBase(policy, Defaults); + + private readonly struct PolicyBuilderBase : IPolicyBuilderBase, IPolicyBuilderBase + { + private readonly PolicyBuilder policyBuilder; + private readonly PolicyBuilder genericPolicyBuilder; + private readonly Defaults Defaults; + + private Func DefaultSleepDurationProvider() + => ExponentialSleepDurationProvider(Defaults.PolicyOptions.ExponentialBackOffExponent); + + private static Func ExponentialSleepDurationProvider(double exponentialBackOffExponent) + => (attempt, _1, _2) => TimeSpan.FromSeconds(Math.Pow(exponentialBackOffExponent, attempt)); + + private static Func ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(Func sleepDurationProvider, int exponentialBackOffExponent, bool backupSkipProvidedIncrements) + => (attempt, exception, ctx) => + { + return backupSkipProvidedIncrements + ? AdjustAttemptIfNeeded() + : sleepDurationProvider(attempt, exception) ?? 
ExponentialSleepDurationProvider(exponentialBackOffExponent)(attempt, exception, ctx); + + TimeSpan AdjustAttemptIfNeeded() + { + if (!ctx.TryGetValue(BackupSkipProvidedIncrementKey, out var value) || value is not int) + { + ctx[BackupSkipProvidedIncrementKey] = value = 0; + } + + var result = sleepDurationProvider(attempt, exception); + + if (result is null) + { + var skipIncrement = (int)value; + attempt -= skipIncrement; + ctx[BackupSkipProvidedIncrementKey] = ++skipIncrement; + result = ExponentialSleepDurationProvider(exponentialBackOffExponent)(attempt, exception, ctx); + } + + return result.Value; + } + }; + + internal PolicyBuilderBase(PolicyBuilder policyBuilder, Defaults defaults) + { + ArgumentNullException.ThrowIfNull(policyBuilder); + ArgumentNullException.ThrowIfNull(defaults); + this.policyBuilder = policyBuilder; + Defaults = defaults; + } + + internal PolicyBuilderBase(PolicyBuilder policyBuilder, Defaults defaults) + { + ArgumentNullException.ThrowIfNull(policyBuilder); + ArgumentNullException.ThrowIfNull(defaults); + this.genericPolicyBuilder = policyBuilder; + Defaults = defaults; + } + + /// + IPolicyBuilderWait IPolicyBuilderBase.WithRetryPolicyOptionsWait() + => new PolicyBuilderWait(policyBuilder, Defaults.PolicyBuilderBase, Defaults.PolicyOptions.MaxRetryCount, DefaultSleepDurationProvider()); + + /// + IPolicyBuilderWait IPolicyBuilderBase.WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func sleepDurationProvider) + => new PolicyBuilderWait(policyBuilder, Defaults.PolicyBuilderBase, maxRetryCount, (attempt, outcome, _1) => sleepDurationProvider(attempt, outcome)); + + /// + IPolicyBuilderWait IPolicyBuilderBase.WithExponentialBackoffWait(int retryCount, double exponentialBackOffExponent) + => new PolicyBuilderWait(policyBuilder, Defaults.PolicyBuilderBase, retryCount, ExponentialSleepDurationProvider(exponentialBackOffExponent)); + + /// + IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func 
sleepDurationProvider, bool backupSkipProvidedIncrements) + => new PolicyBuilderWait(policyBuilder, Defaults.PolicyBuilderBase, Defaults.PolicyOptions.MaxRetryCount, ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, Defaults.PolicyOptions.ExponentialBackOffExponent, backupSkipProvidedIncrements)); + + /// + IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithExponentialBackoffBackup(Func sleepDurationProvider, int retryCount, int exponentialBackOffExponent, bool backupSkipProvidedIncrements) + => new PolicyBuilderWait(policyBuilder, Defaults.PolicyBuilderBase, retryCount, ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, exponentialBackOffExponent, backupSkipProvidedIncrements)); + + + /// + IPolicyBuilderWait IPolicyBuilderBase.WithRetryPolicyOptionsWait() + => new PolicyBuilderWait(genericPolicyBuilder, Defaults.PolicyBuilderBase, Defaults.PolicyOptions.MaxRetryCount, default, DefaultSleepDurationProvider()); + + /// + IPolicyBuilderWait IPolicyBuilderBase.WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func waitDurationProvider) + => new PolicyBuilderWait(genericPolicyBuilder, Defaults.PolicyBuilderBase, maxRetryCount, default, (attempt, outcome, _1) => waitDurationProvider(attempt, outcome)); + + /// + IPolicyBuilderWait IPolicyBuilderBase.WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func, TimeSpan> sleepDurationProvider) + => new PolicyBuilderWait(genericPolicyBuilder, Defaults.PolicyBuilderBase, maxRetryCount, (attempt, outcome, _1) => sleepDurationProvider(attempt, outcome), default); + + /// + IPolicyBuilderWait IPolicyBuilderBase.WithExponentialBackoffWait(int retryCount, double exponentialBackOffExponent) + => new PolicyBuilderWait(genericPolicyBuilder, Defaults.PolicyBuilderBase, retryCount, default, ExponentialSleepDurationProvider(exponentialBackOffExponent)); + + /// + IPolicyBuilderWait 
IPolicyBuilderBase.WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func sleepDurationProvider, bool backupSkipProvidedIncrements) + => new PolicyBuilderWait(genericPolicyBuilder, Defaults.PolicyBuilderBase, Defaults.PolicyOptions.MaxRetryCount, default, ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, Defaults.PolicyOptions.ExponentialBackOffExponent, backupSkipProvidedIncrements)); + + /// + IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithExponentialBackoffBackup(Func sleepDurationProvider, int retryCount, int exponentialBackOffExponent, bool backupSkipProvidedIncrements) + => new PolicyBuilderWait(genericPolicyBuilder, Defaults.PolicyBuilderBase, retryCount, default, ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, exponentialBackOffExponent, backupSkipProvidedIncrements)); + } + + private readonly struct PolicyBuilderWait : IPolicyBuilderWait, IPolicyBuilderWait + { + private readonly PolicyBuilder policyBuilder; + private readonly PolicyBuilder genericPolicyBuilder; + private readonly Func sleepDurationProvider; + private readonly Func, Context, TimeSpan> genericSleepDurationProvider; + private readonly int retryCount; + private readonly RetryHandler builderBase; + + private static Func, Context, TimeSpan> PickSetProvider(Func, Context, TimeSpan> tResultProvider, Func exceptionProvider) + => (attempt, outcome, ctx) => tResultProvider is null ? 
exceptionProvider(attempt, outcome.Exception, ctx) : tResultProvider(attempt, outcome, ctx); + + internal PolicyBuilderWait(PolicyBuilder policyBuilder, RetryHandler builderBase, int retryCount, Func sleepDurationProvider) + { + ArgumentNullException.ThrowIfNull(sleepDurationProvider); + this.policyBuilder = policyBuilder; + this.retryCount = retryCount; + this.sleepDurationProvider = sleepDurationProvider; + this.builderBase = builderBase; + } + + internal PolicyBuilderWait(PolicyBuilder policyBuilder, RetryHandler builderBase, int retryCount, Func, Context, TimeSpan> sleepDurationProviderResult, Func sleepDurationProviderException) + { + if (sleepDurationProviderException is null && sleepDurationProviderResult is null) + { + throw new ArgumentNullException(); + } + + this.genericPolicyBuilder = policyBuilder; + this.retryCount = retryCount; + this.sleepDurationProvider = sleepDurationProviderException; + this.genericSleepDurationProvider = sleepDurationProviderResult; + this.builderBase = builderBase; + } + + /// + IPolicyBuilderBuild IPolicyBuilderWait.SetOnRetryBehavior(ILogger logger, OnRetryHandler onRetry, OnRetryHandlerAsync onRetryAsync) + => new PolicyBuilderBuild(policyBuilder, builderBase, retryCount, sleepDurationProvider, logger, onRetry, onRetryAsync); + + /// + IPolicyBuilderBuild IPolicyBuilderWait.SetOnRetryBehavior(ILogger logger, OnRetryHandler onRetry, OnRetryHandlerAsync onRetryAsync) + => new PolicyBuilderBuild(genericPolicyBuilder, builderBase, retryCount, PickSetProvider(genericSleepDurationProvider, sleepDurationProvider), logger, onRetry, onRetryAsync); + } + + private readonly struct PolicyBuilderBuild : IPolicyBuilderBuild, IPolicyBuilderBuild + { + private readonly PolicyBuilder policyBuilder; + private readonly PolicyBuilder genericPolicyBuilder; + private readonly Func sleepDurationProvider; + private readonly Func, Context, TimeSpan> genericSleepDurationProvider; + private readonly int retryCount; + private readonly ILogger logger; 
+ private readonly OnRetryHandler onRetryHandler; + private readonly OnRetryHandlerAsync onRetryHandlerAsync; + private readonly OnRetryHandler genericOnRetryHandler; + private readonly OnRetryHandlerAsync genericOnRetryHandlerAsync; + + /// + public RetryHandler PolicyBuilderBase { get; } + + internal PolicyBuilderBuild(PolicyBuilder policyBuilder, RetryHandler builderBase, int retryCount, Func sleepDurationProvider, ILogger logger, OnRetryHandler onRetry, OnRetryHandlerAsync onRetryAsync) + { + ArgumentNullException.ThrowIfNull(sleepDurationProvider); + this.policyBuilder = policyBuilder; + this.retryCount = retryCount; + this.sleepDurationProvider = sleepDurationProvider; + this.logger = logger; + this.onRetryHandler = onRetry; + this.onRetryHandlerAsync = onRetryAsync; + this.PolicyBuilderBase = builderBase; + } + + internal PolicyBuilderBuild(PolicyBuilder policyBuilder, RetryHandler builderBase, int retryCount, Func, Context, TimeSpan> sleepDurationProvider, ILogger logger, OnRetryHandler onRetry, OnRetryHandlerAsync onRetryAsync) + { + ArgumentNullException.ThrowIfNull(sleepDurationProvider); + this.genericPolicyBuilder = policyBuilder; + this.retryCount = retryCount; + this.genericSleepDurationProvider = sleepDurationProvider; + this.logger = logger; + this.genericOnRetryHandler = onRetry; + this.genericOnRetryHandlerAsync = onRetryAsync; + this.PolicyBuilderBase = builderBase; + } + + private Action Logger() + { + var logger = this.logger; + + return (exception, timeSpan, retryCount, ctx) => + logger?.LogError(exception, @"Retrying in {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan:c} CorrelationId: {CorrelationId:D} ErrorMessage: {ExceptionMessage}", ctx[CallerMemberNameKey], retryCount, timeSpan, ctx.CorrelationId, exception.Message); + } + + private Action, TimeSpan, int, Context> GenericLogger() + { + var logger = this.logger; + + return (outcome, timeSpan, retryCount, ctx) => + { + if (outcome.Exception is null) + { + 
logger?.LogError(@"Retrying in {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan:c} CorrelationId: {CorrelationId:D}", ctx[CallerMemberNameKey], retryCount, timeSpan, ctx.CorrelationId); + } + else + { + logger?.LogError(outcome.Exception, @"Retrying in {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan:c} CorrelationId: {CorrelationId:D} ErrorMessage: {ExceptionMessage}", ctx[CallerMemberNameKey], retryCount, timeSpan, ctx.CorrelationId, outcome.Exception.Message); + } + }; + } + + private Action OnRetryHandler() + { + var handler = onRetryHandler ?? new((exception, timeSpan, retryCount, correlationId, caller) => { }); + var logger = Logger(); + + return (exception, timeSpan, retryCount, ctx) => + { + handler(exception, timeSpan, retryCount, ctx.CorrelationId, ctx[CallerMemberNameKey] as string); + logger(exception, timeSpan, retryCount, ctx); + }; + } + + private Action, TimeSpan, int, Context> OnGenericRetryHandler() + { + var handler = onRetryHandler ?? new((exception, timeSpan, retryCount, correlationId, caller) => { }); + var genericHandler = genericOnRetryHandler ?? new((outcome, timeSpan, retryCount, correlationId, caller) => handler(outcome.Exception, timeSpan, retryCount, correlationId, caller)); + var logger = GenericLogger(); + + return (outcome, timeSpan, retryCount, ctx) => + { + genericHandler(outcome, timeSpan, retryCount, ctx.CorrelationId, ctx[CallerMemberNameKey] as string); + logger(outcome, timeSpan, retryCount, ctx); + }; + } + + private Func OnRetryHandlerAsync() + { + var handler = onRetryHandler ?? new((exception, timeSpan, retryCount, correlationId, caller) => { }); + var handlerAsync = onRetryHandlerAsync ?? 
new((exception, timeSpan, retryCount, correlationId, caller) => + { + handler(exception, timeSpan, retryCount, correlationId, caller); + return Task.CompletedTask; + }); + var logger = Logger(); + + return async (exception, timeSpan, retryCount, ctx) => + { + await handlerAsync(exception, timeSpan, retryCount, ctx.CorrelationId, ctx[CallerMemberNameKey] as string); + logger(exception, timeSpan, retryCount, ctx); + }; + } + + private Func, TimeSpan, int, Context, Task> OnGenericRetryHandlerAsync() + { + var handler = onRetryHandler ?? new((exception, timeSpan, retryCount, correlationId, caller) => { }); + var genericHandler = genericOnRetryHandler ?? new((outcome, timeSpan, retryCount, correlationId, caller) => handler(outcome.Exception, timeSpan, retryCount, correlationId, caller)); + var handlerAsync = onRetryHandlerAsync; + var genericHandlerAsync = genericOnRetryHandlerAsync ?? new((outcome, timespan, retryCount, correlationId, caller) => + { + if (handlerAsync is null) + { + genericHandler(outcome, timespan, retryCount, correlationId, caller); + return Task.CompletedTask; + } + + return handlerAsync(outcome.Exception, timespan, retryCount, correlationId, caller); + }); + var logger = GenericLogger(); + + return async (outcome, timespan, retryCount, ctx) => + { + await genericHandlerAsync(outcome, timespan, retryCount, ctx.CorrelationId, ctx[CallerMemberNameKey] as string); + logger(outcome, timespan, retryCount, ctx); + }; + } + + /// + ISyncPolicy IPolicyBuilderBuild.Build() + { + var waitProvider = sleepDurationProvider; + var onRetryProvider = OnRetryHandler(); + + return policyBuilder.WaitAndRetry(retryCount, (attempt, ctx) => waitProvider(attempt, default, ctx), onRetryProvider); + } + + /// + IAsyncPolicy IPolicyBuilderBuild.BuildAsync() + { + var waitProvider = sleepDurationProvider; + var onRetryProvider = OnRetryHandlerAsync(); + + return policyBuilder.WaitAndRetryAsync(retryCount, waitProvider, onRetryProvider); + } + + /// + ISyncPolicy 
IPolicyBuilderBuild.Build() + { + var waitProvider = genericSleepDurationProvider; + var onRetryProvider = OnGenericRetryHandler(); + + return genericPolicyBuilder.WaitAndRetry(retryCount, waitProvider, onRetryProvider); + } + + /// + IAsyncPolicy IPolicyBuilderBuild.BuildAsync() + { + var waitProvider = genericSleepDurationProvider; + var onRetryProvider = OnGenericRetryHandlerAsync(); + + return genericPolicyBuilder.WaitAndRetryAsync(retryCount, waitProvider, onRetryProvider); + } + } + #endregion } } diff --git a/src/Tes.ApiClients/Tes.ApiClients.csproj b/src/Tes.ApiClients/Tes.ApiClients.csproj index b60a6b034..84816dba0 100644 --- a/src/Tes.ApiClients/Tes.ApiClients.csproj +++ b/src/Tes.ApiClients/Tes.ApiClients.csproj @@ -9,6 +9,7 @@ + @@ -20,4 +21,13 @@ + + + <_Parameter1>Tes.ApiClients.Tests + + + <_Parameter1>DynamicProxyGenAssembly2 + + + diff --git a/src/Tes.Runner/Docker/DockerExecutor.cs b/src/Tes.Runner/Docker/DockerExecutor.cs index 9e2ad3967..4ccc126bd 100644 --- a/src/Tes.Runner/Docker/DockerExecutor.cs +++ b/src/Tes.Runner/Docker/DockerExecutor.cs @@ -16,8 +16,10 @@ public class DockerExecutor { private readonly IDockerClient dockerClient = null!; private readonly ILogger logger = PipelineLoggerFactory.Create(); - private readonly NetworkUtility networkUtility = new NetworkUtility(); - private readonly RetryHandler retryHandler = new RetryHandler(Options.Create(new RetryPolicyOptions())); + private readonly NetworkUtility networkUtility = new(); + private readonly Polly.IAsyncPolicy asyncRetryPolicy = + new RetryHandler(Options.Create(new RetryPolicyOptions())) + .RetryDefaultPolicyBuilder().SetOnRetryBehavior().BuildAsync(); private readonly IStreamLogReader streamLogReader = null!; const int LogStreamingMaxWaitTimeInSeconds = 30; @@ -127,7 +129,7 @@ private async Task PullImageWithRetriesAsync(string imageName, string? 
tag, Auth { logger.LogInformation($"Pulling image name: {imageName} image tag: {tag}"); - await retryHandler.AsyncRetryPolicy.ExecuteAsync(async () => + await asyncRetryPolicy.ExecuteAsync(async () => { await dockerClient.Images.CreateImageAsync( new ImagesCreateParameters() { FromImage = imageName, Tag = tag }, diff --git a/src/TesApi.Tests/ContainerRegistryProviderTests.cs b/src/TesApi.Tests/ContainerRegistryProviderTests.cs index 77e5af610..82e84e5a3 100644 --- a/src/TesApi.Tests/ContainerRegistryProviderTests.cs +++ b/src/TesApi.Tests/ContainerRegistryProviderTests.cs @@ -22,7 +22,7 @@ public class ContainerRegistryProviderTests { private ContainerRegistryProvider containerRegistryProvider; private ContainerRegistryOptions containerRegistryOptions; - private Mock retryHandlerMock; + private Lazy> asyncRetryPolicy = null!; private Mock appCacheMock; private Mock> containerRegistryOptionsMock; private Mock> loggerMock; @@ -34,13 +34,14 @@ public class ContainerRegistryProviderTests public void Setup() { appCacheMock = new Mock(); - retryHandlerMock = new Mock(); + var retryHandlerMock = new Mock(); retryHandlerMock.Setup(r => r.AppCache).Returns(appCacheMock.Object); clientFactoryMock = new Mock(); containerRegistryOptionsMock = new Mock>(); containerRegistryOptions = new ContainerRegistryOptions(); containerRegistryOptionsMock.Setup(o => o.Value).Returns(containerRegistryOptions); loggerMock = new Mock>(); + asyncRetryPolicy = new(Tes.ApiClients.Tests.TestServices.RetryHandlersHelpers.GetCachingAsyncRetryPolicyMock(retryHandlerMock)); containerRegistryProvider = new ContainerRegistryProvider(containerRegistryOptionsMock.Object, retryHandlerMock.Object, clientFactoryMock.Object, loggerMock.Object); } @@ -50,11 +51,11 @@ public async Task GetContainerRegistryInfoAsync_ServerIsAccessible_ReturnsAndAdd { var server = "registry.com"; var image = $"{server}/image"; - retryHandlerMock.Setup(r => - r.ExecuteWithRetryAsync(It.IsAny>>>(), It.IsAny(), It.IsAny())) + 
asyncRetryPolicy.Value.Setup(r => + r.ExecuteAsync(It.IsAny>>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(new List() { - new ContainerRegistryInfo() { RegistryServer = server } + new() { RegistryServer = server } }); appCacheMock.Setup(c => c.CreateEntry(It.IsAny())) .Returns(new Mock().Object); @@ -80,8 +81,8 @@ public async Task GetContainerRegistryInfoAsync_ServerInCache_ReturnsRegistryInf Assert.IsNotNull(container); Assert.AreEqual(server, container.RegistryServer); appCacheMock.Verify(c => c.TryGetValue(It.Is(v => $"{nameof(ContainerRegistryProvider)}:{image}".Equals(v)), out It.Ref.IsAny), Times.Once()); - retryHandlerMock.Verify(r => - r.ExecuteWithRetryAsync(It.IsAny>>>(), It.IsAny(), It.IsAny()), Times.Never); + asyncRetryPolicy.Value.Verify(r => + r.ExecuteAsync(It.IsAny>>>(), It.IsAny(), It.IsAny()), Times.Never); } [TestMethod] @@ -100,13 +101,13 @@ public async Task GetContainerRegistryInfoAsync_NoAccessibleServerNoServerCached { var server = "registry"; var image = $"{server}_other/image"; - retryHandlerMock.Setup(r => - r.ExecuteWithRetryAsync( - It.IsAny>>>(), - It.IsAny(), It.IsAny())) + asyncRetryPolicy.Value.Setup(r => + r.ExecuteAsync( + It.IsAny>>>(), + It.IsAny(), It.IsAny())) .ReturnsAsync(new List() { - new ContainerRegistryInfo() { RegistryServer = server } + new() { RegistryServer = server } }); var container = await containerRegistryProvider.GetContainerRegistryInfoAsync(image, System.Threading.CancellationToken.None); diff --git a/src/TesApi.Tests/StartupTests.cs b/src/TesApi.Tests/StartupTests.cs index d71df0ba5..30f6717bd 100644 --- a/src/TesApi.Tests/StartupTests.cs +++ b/src/TesApi.Tests/StartupTests.cs @@ -79,7 +79,7 @@ private void ConfigureTerraOptions() } [TestMethod] - public void ConfigureServices_TerraOptionsAreConfigured_TerraStorageProvidedIsResolved() + public void ConfigureServices_TerraOptionsAreConfigured_TerraStorageProviderIsResolved() { ConfigureTerraOptions(); diff --git a/src/TesApi.Tests/TesApi.Tests.csproj 
b/src/TesApi.Tests/TesApi.Tests.csproj index 5b80faf58..73a8ebf1d 100644 --- a/src/TesApi.Tests/TesApi.Tests.csproj +++ b/src/TesApi.Tests/TesApi.Tests.csproj @@ -5,6 +5,10 @@ false + + + + diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index 657e7d979..b3e500d34 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -23,9 +23,7 @@ using Microsoft.WindowsAzure.Storage.Blob; using Newtonsoft.Json; using Polly; -using Polly.Retry; using Tes.ApiClients; -using Tes.ApiClients.Options; using Tes.Models; using TesApi.Web.Extensions; using TesApi.Web.Management.Batch; @@ -49,7 +47,8 @@ namespace TesApi.Web public partial class AzureProxy : IAzureProxy { private const char BatchJobAttemptSeparator = '-'; - private readonly AsyncRetryPolicy batchRetryPolicy; + private readonly IAsyncPolicy batchRetryPolicyWhenJobNotFound; + private readonly IAsyncPolicy batchRetryPolicyWhenNodeNotReady; private readonly ILogger logger; private readonly BatchClient batchClient; @@ -66,14 +65,14 @@ public partial class AzureProxy : IAzureProxy /// /// The Azure Batch Account options /// - /// Retry policy options + /// Retry builder /// The logger /// - public AzureProxy(IOptions batchAccountOptions, IBatchPoolManager batchPoolManager, IOptions retryPolicyOptions, ILogger logger) + public AzureProxy(IOptions batchAccountOptions, IBatchPoolManager batchPoolManager, RetryHandler retryHandler, ILogger logger) { ArgumentNullException.ThrowIfNull(batchAccountOptions); ArgumentNullException.ThrowIfNull(batchPoolManager); - ArgumentNullException.ThrowIfNull(retryPolicyOptions); + ArgumentNullException.ThrowIfNull(retryHandler); ArgumentNullException.ThrowIfNull(logger); this.batchPoolManager = batchPoolManager; @@ -85,11 +84,17 @@ public AzureProxy(IOptions batchAccountOptions, IBatchPoolM throw new InvalidOperationException("The batch account name is missing from the the configuration."); } - batchRetryPolicy = Policy - .Handle() - 
.WaitAndRetryAsync(retryPolicyOptions.Value.MaxRetryCount, - (attempt, exception, ctx) => (exception as BatchException)?.RequestInformation?.RetryAfter ?? TimeSpan.FromSeconds(Math.Pow(retryPolicyOptions.Value.ExponentialBackOffExponent, attempt)), - (outcome, timespan, retryCount, ctx) => { RetryHandler.OnRetry(outcome, timespan, retryCount, ctx); return Task.CompletedTask; }); + var builder = retryHandler.PolicyBuilder + .OpinionatedRetryPolicy(Policy.Handle()) + .WithExceptionBasedWaitWithRetryPolicyOptionsBackup((attempt, exception) => (exception as BatchException)?.RequestInformation?.RetryAfter, true); + + batchRetryPolicyWhenJobNotFound = builder + .SetOnRetryBehavior(onRetry: OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenJobNotFound(LogRetryErrorOnRetryHandler())) + .BuildAsync(); + + batchRetryPolicyWhenNodeNotReady = builder + .SetOnRetryBehavior(onRetry: OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenNodeNotReady(LogRetryErrorOnRetryHandler())) + .BuildAsync(); if (!string.IsNullOrWhiteSpace(batchAccountOptions.Value.AppKey)) { @@ -126,13 +131,13 @@ public AzureProxy(IOptions batchAccountOptions, IBatchPoolM /// Polly retry handler. /// private static RetryHandler.OnRetryHandler OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenJobNotFound(RetryHandler.OnRetryHandler OnRetry) - => new((outcome, timespan, retryCount, correlationId) => + => new((outcome, timespan, retryCount, correlationId, caller) => { if (outcome is BatchException batchException && batchException.RequestInformation?.BatchError?.Code != BatchErrorCodeStrings.JobNotFound) { System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(outcome).Throw(); } - OnRetry?.Invoke(outcome, timespan, retryCount, correlationId); + OnRetry?.Invoke(outcome, timespan, retryCount, correlationId, caller); }); /// @@ -141,22 +146,21 @@ private static RetryHandler.OnRetryHandler OnRetryMicrosoftAzureBatchCommonBatch /// Polly retry handler. 
/// private static RetryHandler.OnRetryHandler OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenNodeNotReady(RetryHandler.OnRetryHandler OnRetry) - => new((outcome, timespan, retryCount, correlationId) => + => new((outcome, timespan, retryCount, correlationId, caller) => { if (outcome is BatchException batchException && !"NodeNotReady".Equals(batchException.RequestInformation?.BatchError?.Code, StringComparison.InvariantCultureIgnoreCase)) { System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(outcome).Throw(); } - OnRetry?.Invoke(outcome, timespan, retryCount, correlationId); + OnRetry?.Invoke(outcome, timespan, retryCount, correlationId, caller); }); /// /// A logging Polly retry handler. /// - /// Calling method name. /// - private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHandler([System.Runtime.CompilerServices.CallerMemberName] string caller = default) - => new((exception, timeSpan, retryCount, correlationId) => + private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHandler() + => new((exception, timeSpan, retryCount, correlationId, caller) => { var requestId = (exception as BatchException)?.RequestInformation?.ServiceRequestId ?? "n/a"; var reason = (exception.InnerException as Microsoft.Azure.Batch.Protocol.Models.BatchErrorException)?.Response?.ReasonPhrase ?? 
"n/a"; @@ -268,9 +272,8 @@ public async Task CreateAutoPoolModeBatchJobAsync(string jobId, CloudTask cloudT try { logger.LogInformation($"TES task: {cloudTask.Id} adding task to job."); - job = await batchRetryPolicy.ExecuteAsync((_, ct) => + job = await batchRetryPolicyWhenJobNotFound.ExecuteAsync(ct => batchClient.JobOperations.GetJobAsync(job.Id, cancellationToken: ct), - RetryHandler.PrepareContext(OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenJobNotFound(LogRetryErrorOnRetryHandler())), cancellationToken); await job.AddTaskAsync(cloudTask, cancellationToken: cancellationToken); @@ -315,9 +318,8 @@ public async Task AddBatchTaskAsync(string tesTaskId, CloudTask cloudTask, PoolI ArgumentException.ThrowIfNullOrEmpty(poolInformation?.PoolId, nameof(poolInformation)); logger.LogInformation("TES task: {TesTask} - Adding task to job {BatchJob}", tesTaskId, poolInformation.PoolId); - var job = await batchRetryPolicy.ExecuteAsync((_, ct) => + var job = await batchRetryPolicyWhenJobNotFound.ExecuteAsync(ct => batchClient.JobOperations.GetJobAsync(poolInformation.PoolId, cancellationToken: ct), - RetryHandler.PrepareContext(OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenJobNotFound(LogRetryErrorOnRetryHandler())), cancellationToken); await job.AddTaskAsync(cloudTask, cancellationToken: cancellationToken); @@ -562,9 +564,7 @@ public async Task DeleteBatchTaskAsync(string tesTaskId, PoolInformation pool, C foreach (var task in batchTasksToDelete) { logger.LogInformation("Deleting task {BatchTask}", task.Id); - await batchRetryPolicy.ExecuteAsync((_, ct) => task.DeleteAsync(cancellationToken: ct), - RetryHandler.PrepareContext(OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenNodeNotReady(LogRetryErrorOnRetryHandler())), - cancellationToken); + await batchRetryPolicyWhenNodeNotReady.ExecuteAsync(ct => task.DeleteAsync(cancellationToken: ct), cancellationToken); } } diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs 
b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index 37ca9846d..beca43672 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -11,7 +11,6 @@ using Microsoft.Azure.Batch.Common; using Microsoft.Extensions.Caching.Memory; using Microsoft.Extensions.Logging; -using Polly; using Tes.ApiClients; using TesApi.Web.Extensions; using TesApi.Web.Storage; @@ -26,7 +25,10 @@ public class CachingWithRetriesAzureProxy : IAzureProxy { private readonly ILogger logger; private readonly IAzureProxy azureProxy; - private readonly CachingRetryHandler cachingRetryHandler; + private readonly CachingRetryHandler.ICachingSyncPolicy cachingRetry; + private readonly CachingRetryHandler.ICachingAsyncPolicy cachingAsyncRetry; + private readonly CachingRetryHandler.ICachingAsyncPolicy cachingAsyncRetryExceptWhenExists; + private readonly CachingRetryHandler.ICachingAsyncPolicy cachingAsyncRetryExceptWhenNotFound; /// /// Contructor to create a cache of @@ -39,9 +41,18 @@ public CachingWithRetriesAzureProxy(IAzureProxy azureProxy, CachingRetryHandler ArgumentNullException.ThrowIfNull(azureProxy); ArgumentNullException.ThrowIfNull(cachingRetryHandler); - this.cachingRetryHandler = cachingRetryHandler; this.azureProxy = azureProxy; this.logger = logger; + + var builder = cachingRetryHandler.RetryDefaultPolicyBuilder(); + this.cachingRetry = builder.SetOnRetryBehavior(this.logger).AddCaching().Build(); + this.cachingAsyncRetry = builder.SetOnRetryBehavior(this.logger).AddCaching().BuildAsync(); + this.cachingAsyncRetryExceptWhenExists = builder + .SetOnRetryBehavior(this.logger, OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenExists()) + .AddCaching().BuildAsync(); + this.cachingAsyncRetryExceptWhenNotFound = builder + .SetOnRetryBehavior(this.logger, OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound()) + .AddCaching().BuildAsync(); } private static readonly string[] CreationErrorFoundCodes = new[] @@ -59,33 
+70,29 @@ public CachingWithRetriesAzureProxy(IAzureProxy azureProxy, CachingRetryHandler }; /// - /// Rethrows exception if exception is and the Batch API call returned otherwise invokes . + /// Rethrows exception if exception is and the Batch API call returned otherwise invokes 's logger. /// - /// Polly retry handler. /// - private static RetryHandler.OnRetryHandler OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenExists(RetryHandler.OnRetryHandler OnRetry) - => new((outcome, timespan, retryCount, correlationId) => + private static RetryHandler.OnRetryHandler OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenExists() + => new((exception, _, _, _, _) => { - if (outcome is BatchException batchException && CreationErrorFoundCodes.Contains(batchException.RequestInformation?.BatchError?.Code, StringComparer.OrdinalIgnoreCase)) + if (exception is BatchException batchException && CreationErrorFoundCodes.Contains(batchException.RequestInformation?.BatchError?.Code, StringComparer.OrdinalIgnoreCase)) { - ExceptionDispatchInfo.Capture(outcome).Throw(); + ExceptionDispatchInfo.Capture(exception).Throw(); } - OnRetry?.Invoke(outcome, timespan, retryCount, correlationId); }); /// - /// Rethrows exception if exception is and the Batch API call returned otherwise invokes . + /// Rethrows exception if exception is and the Batch API call returned otherwise invokes 's logger. /// - /// Polly retry handler. 
/// - private static RetryHandler.OnRetryHandler OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound(RetryHandler.OnRetryHandler OnRetry) - => new((outcome, timespan, retryCount, correlationId) => + private static RetryHandler.OnRetryHandler OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound() + => new((exception, _, _, _, _) => { - if (outcome is BatchException batchException && DeletionErrorFoundCodes.Contains(batchException.RequestInformation?.BatchError?.Code, StringComparer.OrdinalIgnoreCase)) + if (exception is BatchException batchException && DeletionErrorFoundCodes.Contains(batchException.RequestInformation?.BatchError?.Code, StringComparer.OrdinalIgnoreCase)) { - ExceptionDispatchInfo.Capture(outcome).Throw(); + ExceptionDispatchInfo.Capture(exception).Throw(); } - OnRetry?.Invoke(outcome, timespan, retryCount, correlationId); }); @@ -97,8 +104,7 @@ public async Task CreateBatchJobAsync(PoolInformation poolInformation, Cancellat { try { - await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.CreateBatchJobAsync(poolInformation, ct), cancellationToken, - OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenExists(RetryHandler.LogRetryErrorOnRetryHandler(logger))); + await cachingAsyncRetryExceptWhenExists.ExecuteWithRetryAsync(ct => azureProxy.CreateBatchJobAsync(poolInformation, ct), cancellationToken); } catch (BatchException exc) when (BatchErrorCodeStrings.JobExists.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { } @@ -109,8 +115,7 @@ public async Task AddBatchTaskAsync(string tesTaskId, CloudTask cloudTask, PoolI { try { - await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.AddBatchTaskAsync(tesTaskId, cloudTask, poolInformation, ct), cancellationToken, - OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenExists(RetryHandler.LogRetryErrorOnRetryHandler(logger))); + await cachingAsyncRetryExceptWhenExists.ExecuteWithRetryAsync(ct => 
azureProxy.AddBatchTaskAsync(tesTaskId, cloudTask, poolInformation, ct), cancellationToken); } catch (BatchException exc) when (BatchErrorCodeStrings.TaskExists.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { } @@ -121,8 +126,7 @@ public async Task DeleteBatchJobAsync(PoolInformation poolInformation, Cancellat { try { - await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchJobAsync(poolInformation, ct), cancellationToken, - OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound(RetryHandler.LogRetryErrorOnRetryHandler(logger))); + await cachingAsyncRetryExceptWhenNotFound.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchJobAsync(poolInformation, ct), cancellationToken); } catch (BatchException exc) when (BatchErrorCodeStrings.JobNotFound.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { } @@ -133,8 +137,7 @@ public async Task DeleteBatchJobAsync(string taskId, CancellationToken cancellat { try { - await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchJobAsync(taskId, ct), cancellationToken, - OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound(RetryHandler.LogRetryErrorOnRetryHandler(logger))); + await cachingAsyncRetryExceptWhenNotFound.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchJobAsync(taskId, ct), cancellationToken); } catch (BatchException exc) when (BatchErrorCodeStrings.JobNotFound.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { } @@ -145,8 +148,7 @@ public async Task DeleteBatchTaskAsync(string taskId, PoolInformation poolInform { try { - await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchTaskAsync(taskId, poolInformation, ct), cancellationToken, - OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound(RetryHandler.LogRetryErrorOnRetryHandler(logger))); + await cachingAsyncRetryExceptWhenNotFound.ExecuteWithRetryAsync(ct => 
azureProxy.DeleteBatchTaskAsync(taskId, poolInformation, ct), cancellationToken); } catch (BatchException exc) when (BatchErrorCodeStrings.TaskNotFound.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { } @@ -157,8 +159,7 @@ public async Task DeleteBatchPoolAsync(string poolId, CancellationToken cancella { try { - await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchPoolAsync(poolId, ct), cancellationToken, - OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound(RetryHandler.LogRetryErrorOnRetryHandler(logger))); + await cachingAsyncRetryExceptWhenNotFound.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchPoolAsync(poolId, ct), cancellationToken); } catch (BatchException exc) when (BatchErrorCodeStrings.TaskNotFound.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { } @@ -167,116 +168,102 @@ await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchPool /// public Task GetBatchPoolAsync(string poolId, CancellationToken cancellationToken, DetailLevel detailLevel) { - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBatchPoolAsync(poolId, ct, detailLevel), cancellationToken, - RetryHandler.LogRetryErrorOnRetryHandler(logger)); + return cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.GetBatchPoolAsync(poolId, ct, detailLevel), cancellationToken); } /// public Task GetBatchJobAsync(string jobId, CancellationToken cancellationToken, DetailLevel detailLevel) { - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBatchJobAsync(jobId, ct, detailLevel), cancellationToken, - RetryHandler.LogRetryErrorOnRetryHandler(logger)); + return cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.GetBatchJobAsync(jobId, ct, detailLevel), cancellationToken); } /// public async Task DeleteBatchComputeNodesAsync(string poolId, IEnumerable computeNodes, CancellationToken cancellationToken) { - 
cachingRetryHandler.AppCache.Remove($"{nameof(CachingWithRetriesAzureProxy)}:{poolId}"); - await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchComputeNodesAsync(poolId, computeNodes, ct), cancellationToken); + cachingAsyncRetry.AppCache.Remove($"{nameof(CachingWithRetriesAzureProxy)}:{poolId}"); + await cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchComputeNodesAsync(poolId, computeNodes, ct), cancellationToken); } /// public Task DownloadBlobAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) { - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.DownloadBlobAsync(blobAbsoluteUri, ct), cancellationToken, - RetryHandler.LogRetryErrorOnRetryHandler(logger)); + return cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.DownloadBlobAsync(blobAbsoluteUri, ct), cancellationToken); } /// public Task BlobExistsAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) { - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.BlobExistsAsync(blobAbsoluteUri, ct), cancellationToken, - RetryHandler.LogRetryErrorOnRetryHandler(logger)); + return cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.BlobExistsAsync(blobAbsoluteUri, ct), cancellationToken); } /// public Task> GetActivePoolIdsAsync(string prefix, TimeSpan minAge, CancellationToken cancellationToken) { - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetActivePoolIdsAsync(prefix, minAge, ct), cancellationToken, - RetryHandler.LogRetryErrorOnRetryHandler(logger)); + return cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.GetActivePoolIdsAsync(prefix, minAge, ct), cancellationToken); } /// public IAsyncEnumerable GetActivePoolsAsync(string hostName) { - return cachingRetryHandler.ExecuteWithRetry(() => azureProxy.GetActivePoolsAsync(hostName), - RetryHandler.LogRetryErrorOnRetryHandler(logger)); + return cachingRetry.ExecuteWithRetry(() => azureProxy.GetActivePoolsAsync(hostName)); } /// 
public int GetBatchActiveJobCount() { - return cachingRetryHandler.ExecuteWithRetry(azureProxy.GetBatchActiveJobCount, - RetryHandler.LogRetryErrorOnRetryHandler(logger)); + return cachingRetry.ExecuteWithRetry(azureProxy.GetBatchActiveJobCount); } /// public IEnumerable GetBatchActiveNodeCountByVmSize() { - return cachingRetryHandler.ExecuteWithRetry(azureProxy.GetBatchActiveNodeCountByVmSize, - RetryHandler.LogRetryErrorOnRetryHandler(logger)); + return cachingRetry.ExecuteWithRetry(azureProxy.GetBatchActiveNodeCountByVmSize); } /// public int GetBatchActivePoolCount() { - return cachingRetryHandler.ExecuteWithRetry(azureProxy.GetBatchActivePoolCount, - RetryHandler.LogRetryErrorOnRetryHandler(logger)); + return cachingRetry.ExecuteWithRetry(azureProxy.GetBatchActivePoolCount); } /// public Task GetBatchJobAndTaskStateAsync(Tes.Models.TesTask tesTask, bool usingAutoPools, CancellationToken cancellationToken) { - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBatchJobAndTaskStateAsync(tesTask, usingAutoPools, ct), cancellationToken, - RetryHandler.LogRetryErrorOnRetryHandler(logger)); + return cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.GetBatchJobAndTaskStateAsync(tesTask, usingAutoPools, ct), cancellationToken); } /// public Task GetNextBatchJobIdAsync(string tesTaskId, CancellationToken cancellationToken) { - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetNextBatchJobIdAsync(tesTaskId, ct), cancellationToken, - RetryHandler.LogRetryErrorOnRetryHandler(logger)); + return cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.GetNextBatchJobIdAsync(tesTaskId, ct), cancellationToken); } /// public Task> GetPoolIdsReferencedByJobsAsync(CancellationToken cancellationToken) { - return cachingRetryHandler.ExecuteWithRetryAsync(azureProxy.GetPoolIdsReferencedByJobsAsync, cancellationToken, - RetryHandler.LogRetryErrorOnRetryHandler(logger)); + return 
cachingAsyncRetry.ExecuteWithRetryAsync(azureProxy.GetPoolIdsReferencedByJobsAsync, cancellationToken); } /// public Task GetStorageAccountKeyAsync(StorageAccountInfo storageAccountInfo, CancellationToken cancellationToken) { - return cachingRetryHandler.ExecuteWithRetryAndCachingAsync($"{nameof(CachingWithRetriesAzureProxy)}:{storageAccountInfo.Id}", - ct => azureProxy.GetStorageAccountKeyAsync(storageAccountInfo, ct), DateTimeOffset.Now.AddHours(1), cancellationToken, - RetryHandler.LogRetryErrorOnRetryHandler(logger)); + return cachingAsyncRetry.ExecuteWithRetryAndCachingAsync($"{nameof(CachingWithRetriesAzureProxy)}:{storageAccountInfo.Id}", + ct => azureProxy.GetStorageAccountKeyAsync(storageAccountInfo, ct), DateTimeOffset.Now.AddHours(1), cancellationToken); } /// public async Task GetStorageAccountInfoAsync(string storageAccountName, CancellationToken cancellationToken) { - var storageAccountInfo = cachingRetryHandler.AppCache.Get($"{nameof(CachingWithRetriesAzureProxy)}:{storageAccountName}"); + var storageAccountInfo = cachingAsyncRetry.AppCache.Get($"{nameof(CachingWithRetriesAzureProxy)}:{storageAccountName}"); if (storageAccountInfo is null) { - storageAccountInfo = await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetStorageAccountInfoAsync(storageAccountName, ct), cancellationToken, - RetryHandler.LogRetryErrorOnRetryHandler(logger)); + storageAccountInfo = await cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.GetStorageAccountInfoAsync(storageAccountName, ct), cancellationToken); if (storageAccountInfo is not null) { - cachingRetryHandler.AppCache.Set($"{nameof(CachingWithRetriesAzureProxy)}:{storageAccountName}", storageAccountInfo, DateTimeOffset.MaxValue); + cachingAsyncRetry.AppCache.Set($"{nameof(CachingWithRetriesAzureProxy)}:{storageAccountName}", storageAccountInfo, DateTimeOffset.MaxValue); } } @@ -286,43 +273,37 @@ public async Task GetStorageAccountInfoAsync(string storageA /// public Task> ListBlobsAsync(Uri 
directoryUri, CancellationToken cancellationToken) { - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.ListBlobsAsync(directoryUri, ct), cancellationToken, - RetryHandler.LogRetryErrorOnRetryHandler(logger)); + return cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.ListBlobsAsync(directoryUri, ct), cancellationToken); } /// public Task> ListOldJobsToDeleteAsync(TimeSpan oldestJobAge, CancellationToken cancellationToken) { - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.ListOldJobsToDeleteAsync(oldestJobAge, ct), cancellationToken, - RetryHandler.LogRetryErrorOnRetryHandler(logger)); + return cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.ListOldJobsToDeleteAsync(oldestJobAge, ct), cancellationToken); } /// public Task> ListOrphanedJobsToDeleteAsync(TimeSpan minJobAge, CancellationToken cancellationToken) { - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.ListOrphanedJobsToDeleteAsync(minJobAge, ct), cancellationToken, - RetryHandler.LogRetryErrorOnRetryHandler(logger)); + return cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.ListOrphanedJobsToDeleteAsync(minJobAge, ct), cancellationToken); } /// public Task UploadBlobAsync(Uri blobAbsoluteUri, string content, CancellationToken cancellationToken) { - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.UploadBlobAsync(blobAbsoluteUri, content, ct), cancellationToken, - RetryHandler.LogRetryErrorOnRetryHandler(logger)); + return cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.UploadBlobAsync(blobAbsoluteUri, content, ct), cancellationToken); } /// public Task UploadBlobFromFileAsync(Uri blobAbsoluteUri, string filePath, CancellationToken cancellationToken) { - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.UploadBlobFromFileAsync(blobAbsoluteUri, filePath, ct), cancellationToken, - RetryHandler.LogRetryErrorOnRetryHandler(logger)); + return cachingAsyncRetry.ExecuteWithRetryAsync(ct => 
azureProxy.UploadBlobFromFileAsync(blobAbsoluteUri, filePath, ct), cancellationToken); } /// public Task GetBlobPropertiesAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) { - return cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.GetBlobPropertiesAsync(blobAbsoluteUri, ct), cancellationToken, - RetryHandler.LogRetryErrorOnRetryHandler(logger)); + return cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.GetBlobPropertiesAsync(blobAbsoluteUri, ct), cancellationToken); } /// @@ -339,8 +320,7 @@ public async Task CreateBatchPoolAsync(BatchModels.Pool poolInf { try { - return await cachingRetryHandler.ExecuteWithRetryAsync(ct => azureProxy.CreateBatchPoolAsync(poolInfo, isPreemptable, ct), cancellationToken, - OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenExists(RetryHandler.LogRetryErrorOnRetryHandler(logger))); + return await cachingAsyncRetryExceptWhenExists.ExecuteWithRetryAsync(ct => azureProxy.CreateBatchPoolAsync(poolInfo, isPreemptable, ct), cancellationToken); } catch (BatchException exc) when (BatchErrorCodeStrings.PoolExists.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { @@ -355,25 +335,23 @@ public Task DeleteBatchPoolIfExistsAsync(string poolId, CancellationToken cancel /// public Task GetFullAllocationStateAsync(string poolId, CancellationToken cancellationToken) { - return cachingRetryHandler.ExecuteWithRetryAndCachingAsync( + return cachingAsyncRetry.ExecuteWithRetryAndCachingAsync( $"{nameof(CachingWithRetriesAzureProxy)}:{poolId}", ct => azureProxy.GetFullAllocationStateAsync(poolId, ct), DateTimeOffset.Now.Add(BatchPoolService.RunInterval).Subtract(TimeSpan.FromSeconds(1)), - cancellationToken, RetryHandler.LogRetryErrorOnRetryHandler(logger)); + cancellationToken); } /// public IAsyncEnumerable ListComputeNodesAsync(string poolId, DetailLevel detailLevel) { - return cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListComputeNodesAsync(poolId, 
detailLevel), cachingRetryHandler.RetryPolicy, - RetryHandler.LogRetryErrorOnRetryHandler(logger)); + return cachingAsyncRetry.ExecuteAsync(() => azureProxy.ListComputeNodesAsync(poolId, detailLevel), cachingRetry); } /// public IAsyncEnumerable ListTasksAsync(string jobId, DetailLevel detailLevel) { - return cachingRetryHandler.AsyncRetryPolicy.ExecuteAsync(() => azureProxy.ListTasksAsync(jobId, detailLevel), cachingRetryHandler.RetryPolicy, - RetryHandler.LogRetryErrorOnRetryHandler(logger)); + return cachingAsyncRetry.ExecuteAsync(() => azureProxy.ListTasksAsync(jobId, detailLevel), cachingRetry); } /// diff --git a/src/TesApi.Web/Extensions/PagedInterfaceExtensions.cs b/src/TesApi.Web/Extensions/PagedInterfaceExtensions.cs index c2c3eef2c..0847da7b9 100644 --- a/src/TesApi.Web/Extensions/PagedInterfaceExtensions.cs +++ b/src/TesApi.Web/Extensions/PagedInterfaceExtensions.cs @@ -9,6 +9,7 @@ using Microsoft.Azure.Management.ResourceManager.Fluent.Core; using Microsoft.Rest.Azure; using Polly.Retry; +using Tes.ApiClients; // TODO: move this to Common.csproj? namespace TesApi.Web.Extensions @@ -57,15 +58,15 @@ public static IAsyncEnumerable ToAsyncEnumerable(this IPage source, Fun /// Policy retrying calls made while enumerating results returned by . /// Method returning . /// Policy retrying call to . - /// to use. Defaults to none. Assumes policies configured like the ones in . + /// Name of method originating the retriable operation. 
/// - public static IAsyncEnumerable ExecuteAsync(this AsyncRetryPolicy asyncRetryPolicy, Func> func, RetryPolicy retryPolicy, Tes.ApiClients.RetryHandler.OnRetryHandler onRetry = default) + public static IAsyncEnumerable ExecuteAsync(this Polly.IAsyncPolicy asyncRetryPolicy, Func> func, Polly.ISyncPolicy retryPolicy, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) { ArgumentNullException.ThrowIfNull(asyncRetryPolicy); ArgumentNullException.ThrowIfNull(func); ArgumentNullException.ThrowIfNull(retryPolicy); - var ctx = Tes.ApiClients.RetryHandler.PrepareContext(onRetry); + var ctx = new Polly.Context { [RetryHandler.CallerMemberNameKey] = caller }; return new PollyAsyncEnumerable(retryPolicy.Execute(_ => func(), ctx), asyncRetryPolicy, ctx); } @@ -102,10 +103,10 @@ public IAsyncEnumerator GetAsyncEnumerator(CancellationToken cancellationToke private sealed class PollyAsyncEnumerable : IAsyncEnumerable { private readonly IAsyncEnumerable _source; - private readonly AsyncRetryPolicy _retryPolicy; + private readonly Polly.IAsyncPolicy _retryPolicy; private readonly Polly.Context _ctx; - public PollyAsyncEnumerable(IAsyncEnumerable source, AsyncRetryPolicy retryPolicy, Polly.Context ctx) + public PollyAsyncEnumerable(IAsyncEnumerable source, Polly.IAsyncPolicy retryPolicy, Polly.Context ctx) { ArgumentNullException.ThrowIfNull(source); ArgumentNullException.ThrowIfNull(retryPolicy); @@ -123,11 +124,11 @@ IAsyncEnumerator IAsyncEnumerable.GetAsyncEnumerator(CancellationToken can private sealed class PollyAsyncEnumerator : IAsyncEnumerator { private readonly IAsyncEnumerator _source; - private readonly AsyncRetryPolicy _retryPolicy; + private readonly Polly.IAsyncPolicy _retryPolicy; private readonly CancellationToken _cancellationToken; private readonly Polly.Context _ctx; - public PollyAsyncEnumerator(IAsyncEnumerator source, AsyncRetryPolicy retryPolicy, Polly.Context ctx, CancellationToken cancellationToken) + public 
PollyAsyncEnumerator(IAsyncEnumerator source, Polly.IAsyncPolicy retryPolicy, Polly.Context ctx, CancellationToken cancellationToken) { ArgumentNullException.ThrowIfNull(source); ArgumentNullException.ThrowIfNull(retryPolicy); diff --git a/src/TesApi.Web/Management/AzureProvider.cs b/src/TesApi.Web/Management/AzureProvider.cs index ac60eacbd..536d372a5 100644 --- a/src/TesApi.Web/Management/AzureProvider.cs +++ b/src/TesApi.Web/Management/AzureProvider.cs @@ -12,14 +12,14 @@ namespace TesApi.Web.Management /// public abstract class AzureProvider { - private protected readonly CachingRetryHandler CachingRetryHandler; + private protected readonly CachingRetryHandler.ICachingAsyncPolicy CachingAsyncRetryPolicy; private protected readonly AzureManagementClientsFactory ManagementClientsFactory; private protected readonly ILogger Logger; /// /// Protected constructor AzureProvider /// - /// + /// /// /// > protected AzureProvider(CachingRetryHandler cachingRetryHandler, AzureManagementClientsFactory managementClientsFactory, ILogger logger) @@ -28,9 +28,13 @@ protected AzureProvider(CachingRetryHandler cachingRetryHandler, AzureManagement ArgumentNullException.ThrowIfNull(managementClientsFactory); ArgumentNullException.ThrowIfNull(logger); - this.CachingRetryHandler = cachingRetryHandler; this.ManagementClientsFactory = managementClientsFactory; this.Logger = logger; + this.CachingAsyncRetryPolicy = cachingRetryHandler + .RetryDefaultPolicyBuilder() + .SetOnRetryBehavior(this.Logger) + .AddCaching() + .BuildAsync(); } /// diff --git a/src/TesApi.Web/Management/ContainerRegistryProvider.cs b/src/TesApi.Web/Management/ContainerRegistryProvider.cs index cba4a3a63..c9dc3f0a1 100644 --- a/src/TesApi.Web/Management/ContainerRegistryProvider.cs +++ b/src/TesApi.Web/Management/ContainerRegistryProvider.cs @@ -65,7 +65,7 @@ public virtual async Task GetContainerRegistryInfoAsync(s return null; } - var containerRegistryInfo = 
CachingRetryHandler.AppCache.Get($"{nameof(ContainerRegistryProvider)}:{imageName}"); + var containerRegistryInfo = CachingAsyncRetryPolicy.AppCache.Get($"{nameof(ContainerRegistryProvider)}:{imageName}"); if (containerRegistryInfo is not null) { @@ -99,7 +99,7 @@ public bool IsImagePublic(string imageName) private async Task LookUpAndAddToCacheContainerRegistryInfoAsync(string imageName, CancellationToken cancellationToken) { - var repositories = await CachingRetryHandler.ExecuteWithRetryAsync(GetAccessibleContainerRegistriesAsync, cancellationToken, RetryHandler.LogRetryErrorOnRetryHandler(Logger)); + var repositories = await CachingAsyncRetryPolicy.ExecuteWithRetryAsync(GetAccessibleContainerRegistriesAsync, cancellationToken); var requestedRepo = repositories?.FirstOrDefault(reg => reg.RegistryServer.Equals(imageName.Split('/').FirstOrDefault(), StringComparison.OrdinalIgnoreCase)); @@ -107,7 +107,7 @@ private async Task LookUpAndAddToCacheContainerRegistryIn if (requestedRepo is not null) { Logger.LogInformation(@"Requested repository: {DockerImage} was found.", imageName); - CachingRetryHandler.AppCache.Set($"{nameof(ContainerRegistryProvider)}:{imageName}", requestedRepo, DateTimeOffset.UtcNow.AddHours(options.RegistryInfoCacheExpirationInHours)); + CachingAsyncRetryPolicy.AppCache.Set($"{nameof(ContainerRegistryProvider)}:{imageName}", requestedRepo, DateTimeOffset.UtcNow.AddHours(options.RegistryInfoCacheExpirationInHours)); } else { diff --git a/src/TesApi.Web/Startup.cs b/src/TesApi.Web/Startup.cs index 41e12ab24..6b97fe3d1 100644 --- a/src/TesApi.Web/Startup.cs +++ b/src/TesApi.Web/Startup.cs @@ -99,6 +99,7 @@ public void ConfigureServices(IServiceCollection services) .AddAutoMapper(typeof(MappingProfilePoolToWsmRequest)) .AddSingleton() .AddSingleton() + .AddSingleton(s => s.GetRequiredService()) .AddSingleton() .AddSingleton() .AddSingleton() From 48ef66a69de8119ded436948a8b84ace729f2525 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 23 Nov 
2023 01:26:46 -0800 Subject: [PATCH 098/202] formatting --- .../TestServices/RetryHandlersHelpers-HttpResponseMessage.cs | 2 +- src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers.cs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers-HttpResponseMessage.cs b/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers-HttpResponseMessage.cs index 9a8bdfc64..1e183f18b 100644 --- a/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers-HttpResponseMessage.cs +++ b/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers-HttpResponseMessage.cs @@ -2,8 +2,8 @@ // Licensed under the MIT License. using Moq; -using Polly.Retry; using Polly; +using Polly.Retry; using static Tes.ApiClients.CachingRetryHandler; using static Tes.ApiClients.RetryHandler; diff --git a/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers.cs b/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers.cs index 9d64f2a70..1b3ce7948 100644 --- a/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers.cs +++ b/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers.cs @@ -2,8 +2,8 @@ // Licensed under the MIT License. 
using Moq; -using Polly.Retry; using Polly; +using Polly.Retry; using static Tes.ApiClients.CachingRetryHandler; using static Tes.ApiClients.RetryHandler; From 579a7a5c2c8ff300b4f4a21fe62ff3dff69a8153 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 23 Nov 2023 01:27:28 -0800 Subject: [PATCH 099/202] Futureproof Polly v7 code generation (since v8 is a complete refactor) --- src/Tes.ApiClients/CachingRetryHandler.cs | 42 ++++++++++------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/src/Tes.ApiClients/CachingRetryHandler.cs b/src/Tes.ApiClients/CachingRetryHandler.cs index 50b837dae..4cf62b26f 100644 --- a/src/Tes.ApiClients/CachingRetryHandler.cs +++ b/src/Tes.ApiClients/CachingRetryHandler.cs @@ -176,10 +176,9 @@ ICachingAsyncPolicy ICachingPolicyBuilderBuild.BuildAsync() private partial class CachingRetryPolicy : IRetryPolicy, ICachingSyncPolicy { - [BeaKona.AutoInterface] + [BeaKona.AutoInterface(typeof(ISyncPolicy), IncludeBaseInterfaces = true)] + [BeaKona.AutoInterface(typeof(IRetryPolicy), IncludeBaseInterfaces = true)] private readonly ISyncPolicy policy; - [BeaKona.AutoInterface] - private readonly IsPolicy retryPolicy; private readonly CachingRetryHandler handler; public IMemoryCache AppCache => handler.AppCache; @@ -188,7 +187,6 @@ private partial class CachingRetryPolicy : IRetryPolicy, ICachingSyncPolicy public CachingRetryPolicy(CachingRetryHandler handler, ISyncPolicy policy) { ArgumentNullException.ThrowIfNull(policy); - retryPolicy = (IRetryPolicy)policy; this.policy = policy; this.handler = handler; } @@ -196,10 +194,10 @@ public CachingRetryPolicy(CachingRetryHandler handler, ISyncPolicy policy) private partial class CachingRetryPolicy : IRetryPolicy, ICachingSyncPolicy { - [BeaKona.AutoInterface] + [BeaKona.AutoInterface(IncludeBaseInterfaces = true)] private readonly ISyncPolicy policy; - [BeaKona.AutoInterface] - private readonly IsPolicy retryPolicy; + [BeaKona.AutoInterface(IncludeBaseInterfaces = true)] + 
private readonly IRetryPolicy retryPolicy; private readonly CachingRetryHandler handler; public IMemoryCache AppCache => handler.AppCache; @@ -208,46 +206,44 @@ private partial class CachingRetryPolicy : IRetryPolicy, ICach public CachingRetryPolicy(CachingRetryHandler handler, ISyncPolicy policy) { ArgumentNullException.ThrowIfNull(policy); - retryPolicy = (IRetryPolicy)policy; + retryPolicy = (IRetryPolicy)policy; this.policy = policy; this.handler = handler; } } - private partial class CachingAsyncRetryPolicy : IRetryPolicy, ICachingAsyncPolicy + private partial class CachingAsyncRetryPolicy : IRetryPolicy, ICachingAsyncPolicy { - [BeaKona.AutoInterface] - private readonly IAsyncPolicy policy; - [BeaKona.AutoInterface] - private readonly IsPolicy retryPolicy; + [BeaKona.AutoInterface(typeof(IAsyncPolicy), IncludeBaseInterfaces = true)] + [BeaKona.AutoInterface(typeof(IRetryPolicy), IncludeBaseInterfaces = true)] + private readonly IAsyncPolicy policy; private readonly CachingRetryHandler handler; public IMemoryCache AppCache => handler.AppCache; CachingRetryHandler ICachingPolicy.Handler => handler; - public CachingAsyncRetryPolicy(CachingRetryHandler handler, IAsyncPolicy policy) + public CachingAsyncRetryPolicy(CachingRetryHandler handler, IAsyncPolicy policy) { - retryPolicy = (IRetryPolicy)policy; + ArgumentNullException.ThrowIfNull(policy); this.policy = policy; this.handler = handler; } } - private partial class CachingAsyncRetryPolicy : IRetryPolicy, ICachingAsyncPolicy + private partial class CachingAsyncRetryPolicy : IRetryPolicy, ICachingAsyncPolicy { - [BeaKona.AutoInterface] - private readonly IAsyncPolicy policy; - [BeaKona.AutoInterface] - private readonly IsPolicy retryPolicy; + [BeaKona.AutoInterface(IncludeBaseInterfaces = true)] + private readonly IAsyncPolicy policy; + [BeaKona.AutoInterface(IncludeBaseInterfaces = true)] + private readonly IRetryPolicy retryPolicy; private readonly CachingRetryHandler handler; public IMemoryCache AppCache => 
handler.AppCache; CachingRetryHandler ICachingPolicy.Handler => handler; - public CachingAsyncRetryPolicy(CachingRetryHandler handler, IAsyncPolicy policy) + public CachingAsyncRetryPolicy(CachingRetryHandler handler, IAsyncPolicy policy) { - ArgumentNullException.ThrowIfNull(policy); - retryPolicy = (IRetryPolicy)policy; + retryPolicy = (IRetryPolicy)policy; this.policy = policy; this.handler = handler; } From 0d508396389d76bcd5616014c045a61c50856c1d Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 29 Nov 2023 10:32:57 -0800 Subject: [PATCH 100/202] Refactor builders --- src/Tes.ApiClients/CachingRetryHandler.cs | 29 ++- src/Tes.ApiClients/RetryHandler.cs | 297 +++++++++++----------- 2 files changed, 164 insertions(+), 162 deletions(-) diff --git a/src/Tes.ApiClients/CachingRetryHandler.cs b/src/Tes.ApiClients/CachingRetryHandler.cs index 4cf62b26f..4e17d1e52 100644 --- a/src/Tes.ApiClients/CachingRetryHandler.cs +++ b/src/Tes.ApiClients/CachingRetryHandler.cs @@ -128,15 +128,14 @@ public interface ICachingPolicyBuilderHandler #region Builder interface implementations ICachingPolicyBuilderBuild ICachingPolicyBuilderHandler.CachingPolicyBuilder(IPolicyBuilderBuild policyBuilder) - => new CachingPolicyBuilderBuild(policyBuilder, this); + => new CachingPolicyBuilderBuild(policyBuilder, this); ICachingPolicyBuilderBuild ICachingPolicyBuilderHandler.CachingPolicyBuilder(IPolicyBuilderBuild policyBuilder) => new CachingPolicyBuilderBuild(policyBuilder, this); - private readonly struct CachingPolicyBuilderBuild : ICachingPolicyBuilderBuild, ICachingPolicyBuilderBuild + private readonly struct CachingPolicyBuilderBuild : ICachingPolicyBuilderBuild { private readonly IPolicyBuilderBuild policyBuilder; - private readonly IPolicyBuilderBuild genericPolicyBuilder; private readonly CachingRetryHandler cachingHandler; public CachingPolicyBuilderBuild(IPolicyBuilderBuild policyBuilder, CachingRetryHandler handler) @@ -146,13 +145,6 @@ public 
CachingPolicyBuilderBuild(IPolicyBuilderBuild policyBuilder, CachingRetry this.cachingHandler = handler; } - public CachingPolicyBuilderBuild(IPolicyBuilderBuild policyBuilder, CachingRetryHandler handler) - { - ArgumentNullException.ThrowIfNull(policyBuilder); - this.genericPolicyBuilder = policyBuilder; - this.cachingHandler = handler; - } - public ICachingSyncPolicy Build() { return new CachingRetryPolicy(cachingHandler, policyBuilder.Build()); @@ -162,15 +154,28 @@ public ICachingAsyncPolicy BuildAsync() { return new CachingAsyncRetryPolicy(cachingHandler, policyBuilder.BuildAsync()); } + } + + private readonly struct CachingPolicyBuilderBuild : ICachingPolicyBuilderBuild + { + private readonly IPolicyBuilderBuild policyBuilder; + private readonly CachingRetryHandler cachingHandler; + + public CachingPolicyBuilderBuild(IPolicyBuilderBuild policyBuilder, CachingRetryHandler handler) + { + ArgumentNullException.ThrowIfNull(policyBuilder); + this.policyBuilder = policyBuilder; + this.cachingHandler = handler; + } ICachingSyncPolicy ICachingPolicyBuilderBuild.Build() { - return new CachingRetryPolicy(cachingHandler, genericPolicyBuilder.Build()); + return new CachingRetryPolicy(cachingHandler, policyBuilder.Build()); } ICachingAsyncPolicy ICachingPolicyBuilderBuild.BuildAsync() { - return new CachingAsyncRetryPolicy(cachingHandler, genericPolicyBuilder.BuildAsync()); + return new CachingAsyncRetryPolicy(cachingHandler, policyBuilder.BuildAsync()); } } diff --git a/src/Tes.ApiClients/RetryHandler.cs b/src/Tes.ApiClients/RetryHandler.cs index 5234d50b0..3ea7436db 100644 --- a/src/Tes.ApiClients/RetryHandler.cs +++ b/src/Tes.ApiClients/RetryHandler.cs @@ -392,29 +392,36 @@ internal PolicyBuilderPolicy(Defaults options) /// IPolicyBuilderBase IPolicyBuilderPolicy.OpinionatedRetryPolicy() - => new PolicyBuilderBase(Policy.Handle(), Defaults); + => new PolicyBuilderBase(Policy.Handle(), Defaults); /// IPolicyBuilderBase 
IPolicyBuilderPolicy.OpinionatedRetryPolicy(PolicyBuilder policy) - => new PolicyBuilderBase(policy, Defaults); + => new PolicyBuilderBase(policy, Defaults); /// IPolicyBuilderBase IPolicyBuilderPolicy.OpinionatedRetryPolicy(PolicyBuilder policy) => new PolicyBuilderBase(policy, Defaults); - private readonly struct PolicyBuilderBase : IPolicyBuilderBase, IPolicyBuilderBase + private readonly struct PolicyBuilderBase : IPolicyBuilderBase { - private readonly PolicyBuilder policyBuilder; - private readonly PolicyBuilder genericPolicyBuilder; - private readonly Defaults Defaults; + public readonly PolicyBuilder policyBuilder; + public readonly Defaults Defaults; - private Func DefaultSleepDurationProvider() - => ExponentialSleepDurationProvider(Defaults.PolicyOptions.ExponentialBackOffExponent); + internal PolicyBuilderBase(PolicyBuilder policyBuilder, Defaults defaults) + { + ArgumentNullException.ThrowIfNull(policyBuilder); + ArgumentNullException.ThrowIfNull(defaults); + this.policyBuilder = policyBuilder; + Defaults = defaults; + } + + public static Func DefaultSleepDurationProvider(Defaults defaults) + => ExponentialSleepDurationProvider(defaults.PolicyOptions.ExponentialBackOffExponent); - private static Func ExponentialSleepDurationProvider(double exponentialBackOffExponent) + public static Func ExponentialSleepDurationProvider(double exponentialBackOffExponent) => (attempt, _1, _2) => TimeSpan.FromSeconds(Math.Pow(exponentialBackOffExponent, attempt)); - private static Func ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(Func sleepDurationProvider, int exponentialBackOffExponent, bool backupSkipProvidedIncrements) + public static Func ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(Func sleepDurationProvider, int exponentialBackOffExponent, bool backupSkipProvidedIncrements) => (attempt, exception, ctx) => { return backupSkipProvidedIncrements @@ -442,203 +449,153 @@ TimeSpan AdjustAttemptIfNeeded() } }; - internal 
PolicyBuilderBase(PolicyBuilder policyBuilder, Defaults defaults) - { - ArgumentNullException.ThrowIfNull(policyBuilder); - ArgumentNullException.ThrowIfNull(defaults); - this.policyBuilder = policyBuilder; - Defaults = defaults; - } - - internal PolicyBuilderBase(PolicyBuilder policyBuilder, Defaults defaults) - { - ArgumentNullException.ThrowIfNull(policyBuilder); - ArgumentNullException.ThrowIfNull(defaults); - this.genericPolicyBuilder = policyBuilder; - Defaults = defaults; - } /// IPolicyBuilderWait IPolicyBuilderBase.WithRetryPolicyOptionsWait() - => new PolicyBuilderWait(policyBuilder, Defaults.PolicyBuilderBase, Defaults.PolicyOptions.MaxRetryCount, DefaultSleepDurationProvider()); + => new PolicyBuilderWait(this, Defaults.PolicyOptions.MaxRetryCount, DefaultSleepDurationProvider(Defaults)); /// IPolicyBuilderWait IPolicyBuilderBase.WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func sleepDurationProvider) - => new PolicyBuilderWait(policyBuilder, Defaults.PolicyBuilderBase, maxRetryCount, (attempt, outcome, _1) => sleepDurationProvider(attempt, outcome)); + => new PolicyBuilderWait(this, maxRetryCount, (attempt, outcome, _1) => sleepDurationProvider(attempt, outcome)); /// IPolicyBuilderWait IPolicyBuilderBase.WithExponentialBackoffWait(int retryCount, double exponentialBackOffExponent) - => new PolicyBuilderWait(policyBuilder, Defaults.PolicyBuilderBase, retryCount, ExponentialSleepDurationProvider(exponentialBackOffExponent)); + => new PolicyBuilderWait(this, retryCount, ExponentialSleepDurationProvider(exponentialBackOffExponent)); /// IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func sleepDurationProvider, bool backupSkipProvidedIncrements) - => new PolicyBuilderWait(policyBuilder, Defaults.PolicyBuilderBase, Defaults.PolicyOptions.MaxRetryCount, ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, Defaults.PolicyOptions.ExponentialBackOffExponent, 
backupSkipProvidedIncrements)); + => new PolicyBuilderWait(this, Defaults.PolicyOptions.MaxRetryCount, ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, Defaults.PolicyOptions.ExponentialBackOffExponent, backupSkipProvidedIncrements)); /// IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithExponentialBackoffBackup(Func sleepDurationProvider, int retryCount, int exponentialBackOffExponent, bool backupSkipProvidedIncrements) - => new PolicyBuilderWait(policyBuilder, Defaults.PolicyBuilderBase, retryCount, ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, exponentialBackOffExponent, backupSkipProvidedIncrements)); + => new PolicyBuilderWait(this, retryCount, ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, exponentialBackOffExponent, backupSkipProvidedIncrements)); + } + private readonly struct PolicyBuilderBase : IPolicyBuilderBase + { + public readonly PolicyBuilder policyBuilder; + public readonly Defaults Defaults; + + internal PolicyBuilderBase(PolicyBuilder policyBuilder, Defaults defaults) + { + ArgumentNullException.ThrowIfNull(policyBuilder); + ArgumentNullException.ThrowIfNull(defaults); + this.policyBuilder = policyBuilder; + Defaults = defaults; + } /// IPolicyBuilderWait IPolicyBuilderBase.WithRetryPolicyOptionsWait() - => new PolicyBuilderWait(genericPolicyBuilder, Defaults.PolicyBuilderBase, Defaults.PolicyOptions.MaxRetryCount, default, DefaultSleepDurationProvider()); + => new PolicyBuilderWait(this, Defaults.PolicyOptions.MaxRetryCount, default, PolicyBuilderBase.DefaultSleepDurationProvider(Defaults)); /// IPolicyBuilderWait IPolicyBuilderBase.WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func waitDurationProvider) - => new PolicyBuilderWait(genericPolicyBuilder, Defaults.PolicyBuilderBase, maxRetryCount, default, (attempt, outcome, _1) => waitDurationProvider(attempt, outcome)); + => new PolicyBuilderWait(this, 
maxRetryCount, default, (attempt, outcome, _1) => waitDurationProvider(attempt, outcome)); /// IPolicyBuilderWait IPolicyBuilderBase.WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func, TimeSpan> sleepDurationProvider) - => new PolicyBuilderWait(genericPolicyBuilder, Defaults.PolicyBuilderBase, maxRetryCount, (attempt, outcome, _1) => sleepDurationProvider(attempt, outcome), default); + => new PolicyBuilderWait(this, maxRetryCount, (attempt, outcome, _1) => sleepDurationProvider(attempt, outcome), default); /// IPolicyBuilderWait IPolicyBuilderBase.WithExponentialBackoffWait(int retryCount, double exponentialBackOffExponent) - => new PolicyBuilderWait(genericPolicyBuilder, Defaults.PolicyBuilderBase, retryCount, default, ExponentialSleepDurationProvider(exponentialBackOffExponent)); + => new PolicyBuilderWait(this, retryCount, default, PolicyBuilderBase.ExponentialSleepDurationProvider(exponentialBackOffExponent)); /// IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func sleepDurationProvider, bool backupSkipProvidedIncrements) - => new PolicyBuilderWait(genericPolicyBuilder, Defaults.PolicyBuilderBase, Defaults.PolicyOptions.MaxRetryCount, default, ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, Defaults.PolicyOptions.ExponentialBackOffExponent, backupSkipProvidedIncrements)); + => new PolicyBuilderWait(this, Defaults.PolicyOptions.MaxRetryCount, default, PolicyBuilderBase.ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, Defaults.PolicyOptions.ExponentialBackOffExponent, backupSkipProvidedIncrements)); /// IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithExponentialBackoffBackup(Func sleepDurationProvider, int retryCount, int exponentialBackOffExponent, bool backupSkipProvidedIncrements) - => new PolicyBuilderWait(genericPolicyBuilder, Defaults.PolicyBuilderBase, retryCount, default, 
ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, exponentialBackOffExponent, backupSkipProvidedIncrements)); + => new PolicyBuilderWait(this, retryCount, default, PolicyBuilderBase.ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, exponentialBackOffExponent, backupSkipProvidedIncrements)); } - private readonly struct PolicyBuilderWait : IPolicyBuilderWait, IPolicyBuilderWait + private readonly struct PolicyBuilderWait : IPolicyBuilderWait { - private readonly PolicyBuilder policyBuilder; - private readonly PolicyBuilder genericPolicyBuilder; - private readonly Func sleepDurationProvider; - private readonly Func, Context, TimeSpan> genericSleepDurationProvider; - private readonly int retryCount; - private readonly RetryHandler builderBase; + public readonly PolicyBuilderBase policyBuilder; + public readonly Func sleepDurationProvider; + public readonly int retryCount; - private static Func, Context, TimeSpan> PickSetProvider(Func, Context, TimeSpan> tResultProvider, Func exceptionProvider) - => (attempt, outcome, ctx) => tResultProvider is null ? 
exceptionProvider(attempt, outcome.Exception, ctx) : tResultProvider(attempt, outcome, ctx); - - internal PolicyBuilderWait(PolicyBuilder policyBuilder, RetryHandler builderBase, int retryCount, Func sleepDurationProvider) + internal PolicyBuilderWait(PolicyBuilderBase policyBuilder, int retryCount, Func sleepDurationProvider) { ArgumentNullException.ThrowIfNull(sleepDurationProvider); this.policyBuilder = policyBuilder; this.retryCount = retryCount; this.sleepDurationProvider = sleepDurationProvider; - this.builderBase = builderBase; } - internal PolicyBuilderWait(PolicyBuilder policyBuilder, RetryHandler builderBase, int retryCount, Func, Context, TimeSpan> sleepDurationProviderResult, Func sleepDurationProviderException) + /// + IPolicyBuilderBuild IPolicyBuilderWait.SetOnRetryBehavior(ILogger logger, OnRetryHandler onRetry, OnRetryHandlerAsync onRetryAsync) + => new PolicyBuilderBuild(this, sleepDurationProvider, logger, onRetry, onRetryAsync); + } + + private readonly struct PolicyBuilderWait : IPolicyBuilderWait + { + public readonly PolicyBuilderBase policyBuilder; + public readonly Func sleepDurationProvider; + public readonly Func, Context, TimeSpan> genericSleepDurationProvider; + public readonly int retryCount; + + private static Func, Context, TimeSpan> PickSetProvider(Func, Context, TimeSpan> tResultProvider, Func exceptionProvider) + => (attempt, outcome, ctx) => tResultProvider is null ? 
exceptionProvider(attempt, outcome.Exception, ctx) : tResultProvider(attempt, outcome, ctx); + + internal PolicyBuilderWait(PolicyBuilderBase policyBuilder, int retryCount, Func, Context, TimeSpan> sleepDurationProviderResult, Func sleepDurationProviderException) { if (sleepDurationProviderException is null && sleepDurationProviderResult is null) { throw new ArgumentNullException(); } - this.genericPolicyBuilder = policyBuilder; + this.policyBuilder = policyBuilder; this.retryCount = retryCount; this.sleepDurationProvider = sleepDurationProviderException; this.genericSleepDurationProvider = sleepDurationProviderResult; - this.builderBase = builderBase; } - /// - IPolicyBuilderBuild IPolicyBuilderWait.SetOnRetryBehavior(ILogger logger, OnRetryHandler onRetry, OnRetryHandlerAsync onRetryAsync) - => new PolicyBuilderBuild(policyBuilder, builderBase, retryCount, sleepDurationProvider, logger, onRetry, onRetryAsync); - /// IPolicyBuilderBuild IPolicyBuilderWait.SetOnRetryBehavior(ILogger logger, OnRetryHandler onRetry, OnRetryHandlerAsync onRetryAsync) - => new PolicyBuilderBuild(genericPolicyBuilder, builderBase, retryCount, PickSetProvider(genericSleepDurationProvider, sleepDurationProvider), logger, onRetry, onRetryAsync); + => new PolicyBuilderBuild(this, PickSetProvider(genericSleepDurationProvider, sleepDurationProvider), logger, onRetry, onRetryAsync); } - private readonly struct PolicyBuilderBuild : IPolicyBuilderBuild, IPolicyBuilderBuild + private readonly struct PolicyBuilderBuild : IPolicyBuilderBuild { - private readonly PolicyBuilder policyBuilder; - private readonly PolicyBuilder genericPolicyBuilder; + private readonly PolicyBuilderWait policyBuilder; private readonly Func sleepDurationProvider; - private readonly Func, Context, TimeSpan> genericSleepDurationProvider; - private readonly int retryCount; private readonly ILogger logger; private readonly OnRetryHandler onRetryHandler; private readonly OnRetryHandlerAsync onRetryHandlerAsync; - private 
readonly OnRetryHandler genericOnRetryHandler; - private readonly OnRetryHandlerAsync genericOnRetryHandlerAsync; /// public RetryHandler PolicyBuilderBase { get; } - internal PolicyBuilderBuild(PolicyBuilder policyBuilder, RetryHandler builderBase, int retryCount, Func sleepDurationProvider, ILogger logger, OnRetryHandler onRetry, OnRetryHandlerAsync onRetryAsync) + internal PolicyBuilderBuild(PolicyBuilderWait policyBuilder, Func sleepDurationProvider, ILogger logger, OnRetryHandler onRetry, OnRetryHandlerAsync onRetryAsync) { ArgumentNullException.ThrowIfNull(sleepDurationProvider); this.policyBuilder = policyBuilder; - this.retryCount = retryCount; this.sleepDurationProvider = sleepDurationProvider; this.logger = logger; this.onRetryHandler = onRetry; this.onRetryHandlerAsync = onRetryAsync; - this.PolicyBuilderBase = builderBase; - } - - internal PolicyBuilderBuild(PolicyBuilder policyBuilder, RetryHandler builderBase, int retryCount, Func, Context, TimeSpan> sleepDurationProvider, ILogger logger, OnRetryHandler onRetry, OnRetryHandlerAsync onRetryAsync) - { - ArgumentNullException.ThrowIfNull(sleepDurationProvider); - this.genericPolicyBuilder = policyBuilder; - this.retryCount = retryCount; - this.genericSleepDurationProvider = sleepDurationProvider; - this.logger = logger; - this.genericOnRetryHandler = onRetry; - this.genericOnRetryHandlerAsync = onRetryAsync; - this.PolicyBuilderBase = builderBase; + this.PolicyBuilderBase = policyBuilder.policyBuilder.Defaults.PolicyBuilderBase; } - private Action Logger() + public static Action Logger(ILogger logger) { - var logger = this.logger; - return (exception, timeSpan, retryCount, ctx) => logger?.LogError(exception, @"Retrying in {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan:c} CorrelationId: {CorrelationId:D} ErrorMessage: {ExceptionMessage}", ctx[CallerMemberNameKey], retryCount, timeSpan, ctx.CorrelationId, exception.Message); } - private Action, TimeSpan, int, Context> GenericLogger() - { - var 
logger = this.logger; - - return (outcome, timeSpan, retryCount, ctx) => - { - if (outcome.Exception is null) - { - logger?.LogError(@"Retrying in {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan:c} CorrelationId: {CorrelationId:D}", ctx[CallerMemberNameKey], retryCount, timeSpan, ctx.CorrelationId); - } - else - { - logger?.LogError(outcome.Exception, @"Retrying in {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan:c} CorrelationId: {CorrelationId:D} ErrorMessage: {ExceptionMessage}", ctx[CallerMemberNameKey], retryCount, timeSpan, ctx.CorrelationId, outcome.Exception.Message); - } - }; - } - - private Action OnRetryHandler() + public static Action OnRetryHandler(ILogger logger, OnRetryHandler onRetryHandler) { var handler = onRetryHandler ?? new((exception, timeSpan, retryCount, correlationId, caller) => { }); - var logger = Logger(); return (exception, timeSpan, retryCount, ctx) => { handler(exception, timeSpan, retryCount, ctx.CorrelationId, ctx[CallerMemberNameKey] as string); - logger(exception, timeSpan, retryCount, ctx); + Logger(logger)(exception, timeSpan, retryCount, ctx); }; } - private Action, TimeSpan, int, Context> OnGenericRetryHandler() - { - var handler = onRetryHandler ?? new((exception, timeSpan, retryCount, correlationId, caller) => { }); - var genericHandler = genericOnRetryHandler ?? new((outcome, timeSpan, retryCount, correlationId, caller) => handler(outcome.Exception, timeSpan, retryCount, correlationId, caller)); - var logger = GenericLogger(); - - return (outcome, timeSpan, retryCount, ctx) => - { - genericHandler(outcome, timeSpan, retryCount, ctx.CorrelationId, ctx[CallerMemberNameKey] as string); - logger(outcome, timeSpan, retryCount, ctx); - }; - } - - private Func OnRetryHandlerAsync() + public static Func OnRetryHandlerAsync(ILogger logger, OnRetryHandler onRetryHandler, OnRetryHandlerAsync onRetryHandlerAsync) { var handler = onRetryHandler ?? 
new((exception, timeSpan, retryCount, correlationId, caller) => { }); var handlerAsync = onRetryHandlerAsync ?? new((exception, timeSpan, retryCount, correlationId, caller) => @@ -646,73 +603,113 @@ private Func OnRetryHandlerAsync() handler(exception, timeSpan, retryCount, correlationId, caller); return Task.CompletedTask; }); - var logger = Logger(); return async (exception, timeSpan, retryCount, ctx) => { await handlerAsync(exception, timeSpan, retryCount, ctx.CorrelationId, ctx[CallerMemberNameKey] as string); - logger(exception, timeSpan, retryCount, ctx); + Logger(logger)(exception, timeSpan, retryCount, ctx); }; } - private Func, TimeSpan, int, Context, Task> OnGenericRetryHandlerAsync() + /// + ISyncPolicy IPolicyBuilderBuild.Build() { - var handler = onRetryHandler ?? new((exception, timeSpan, retryCount, correlationId, caller) => { }); - var genericHandler = genericOnRetryHandler ?? new((outcome, timeSpan, retryCount, correlationId, caller) => handler(outcome.Exception, timeSpan, retryCount, correlationId, caller)); - var handlerAsync = onRetryHandlerAsync; - var genericHandlerAsync = genericOnRetryHandlerAsync ?? 
new((outcome, timespan, retryCount, correlationId, caller) => - { - if (handlerAsync is null) - { - genericHandler(outcome, timespan, retryCount, correlationId, caller); - return Task.CompletedTask; - } - - return handlerAsync(outcome.Exception, timespan, retryCount, correlationId, caller); - }); - var logger = GenericLogger(); + var waitProvider = sleepDurationProvider; + var onRetryProvider = OnRetryHandler(logger, onRetryHandler); - return async (outcome, timespan, retryCount, ctx) => - { - await genericHandlerAsync(outcome, timespan, retryCount, ctx.CorrelationId, ctx[CallerMemberNameKey] as string); - logger(outcome, timespan, retryCount, ctx); - }; + return policyBuilder.policyBuilder.policyBuilder.WaitAndRetry(policyBuilder.retryCount, (attempt, ctx) => waitProvider(attempt, default, ctx), onRetryProvider); } /// - ISyncPolicy IPolicyBuilderBuild.Build() + IAsyncPolicy IPolicyBuilderBuild.BuildAsync() { var waitProvider = sleepDurationProvider; - var onRetryProvider = OnRetryHandler(); + var onRetryProvider = OnRetryHandlerAsync(logger, onRetryHandler, onRetryHandlerAsync); - return policyBuilder.WaitAndRetry(retryCount, (attempt, ctx) => waitProvider(attempt, default, ctx), onRetryProvider); + return policyBuilder.policyBuilder.policyBuilder.WaitAndRetryAsync(policyBuilder.retryCount, waitProvider, onRetryProvider); } + } + + private readonly struct PolicyBuilderBuild : IPolicyBuilderBuild + { + private readonly PolicyBuilderWait policyBuilder; + private readonly Func, Context, TimeSpan> sleepDurationProvider; + private readonly ILogger logger; + private readonly OnRetryHandler onRetryHandler; + private readonly OnRetryHandlerAsync onRetryHandlerAsync; /// - IAsyncPolicy IPolicyBuilderBuild.BuildAsync() + public RetryHandler PolicyBuilderBase { get; } + + internal PolicyBuilderBuild(PolicyBuilderWait policyBuilder, Func, Context, TimeSpan> sleepDurationProvider, ILogger logger, OnRetryHandler onRetry, OnRetryHandlerAsync onRetryAsync) { - var waitProvider = 
sleepDurationProvider; - var onRetryProvider = OnRetryHandlerAsync(); + ArgumentNullException.ThrowIfNull(sleepDurationProvider); + this.policyBuilder = policyBuilder; + this.sleepDurationProvider = sleepDurationProvider; + this.logger = logger; + this.onRetryHandler = onRetry; + this.onRetryHandlerAsync = onRetryAsync; + this.PolicyBuilderBase = policyBuilder.policyBuilder.Defaults.PolicyBuilderBase; + } + + private static Action, TimeSpan, int, Context> Logger(ILogger logger) + { + return (outcome, timeSpan, retryCount, ctx) => + { + if (outcome.Exception is null) + { + logger?.LogError(@"Retrying in {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan:c} CorrelationId: {CorrelationId:D}", ctx[CallerMemberNameKey], retryCount, timeSpan, ctx.CorrelationId); + } + else + { + logger?.LogError(outcome.Exception, @"Retrying in {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan:c} CorrelationId: {CorrelationId:D} ErrorMessage: {ExceptionMessage}", ctx[CallerMemberNameKey], retryCount, timeSpan, ctx.CorrelationId, outcome.Exception.Message); + } + }; + } + + private static Action, TimeSpan, int, Context> OnRetryHandler(ILogger logger, OnRetryHandler onRetryHandler) + { + var genericHandler = onRetryHandler ?? new((outcome, timeSpan, retryCount, correlationId, caller) => { }); - return policyBuilder.WaitAndRetryAsync(retryCount, waitProvider, onRetryProvider); + return (outcome, timeSpan, retryCount, ctx) => + { + genericHandler(outcome, timeSpan, retryCount, ctx.CorrelationId, ctx[CallerMemberNameKey] as string); + Logger(logger)(outcome, timeSpan, retryCount, ctx); + }; + } + + private static Func, TimeSpan, int, Context, Task> OnRetryHandlerAsync(ILogger logger, OnRetryHandler onRetryHandler, OnRetryHandlerAsync onRetryHandlerAsync) + { + var genericHandler = onRetryHandler ?? new((outcome, timeSpan, retryCount, correlationId, caller) => { }); + var genericHandlerAsync = onRetryHandlerAsync ?? 
new((outcome, timespan, retryCount, correlationId, caller) => + { + genericHandler(outcome, timespan, retryCount, correlationId, caller); + return Task.CompletedTask; + }); + + return async (outcome, timespan, retryCount, ctx) => + { + await genericHandlerAsync(outcome, timespan, retryCount, ctx.CorrelationId, ctx[CallerMemberNameKey] as string); + Logger(logger)(outcome, timespan, retryCount, ctx); + }; } /// ISyncPolicy IPolicyBuilderBuild.Build() { - var waitProvider = genericSleepDurationProvider; - var onRetryProvider = OnGenericRetryHandler(); + var waitProvider = sleepDurationProvider; + var onRetryProvider = OnRetryHandler(logger, onRetryHandler); - return genericPolicyBuilder.WaitAndRetry(retryCount, waitProvider, onRetryProvider); + return policyBuilder.policyBuilder.policyBuilder.WaitAndRetry(policyBuilder.retryCount, waitProvider, onRetryProvider); } /// IAsyncPolicy IPolicyBuilderBuild.BuildAsync() { - var waitProvider = genericSleepDurationProvider; - var onRetryProvider = OnGenericRetryHandlerAsync(); + var waitProvider = sleepDurationProvider; + var onRetryProvider = OnRetryHandlerAsync(logger, onRetryHandler, onRetryHandlerAsync); - return genericPolicyBuilder.WaitAndRetryAsync(retryCount, waitProvider, onRetryProvider); + return policyBuilder.policyBuilder.policyBuilder.WaitAndRetryAsync(policyBuilder.retryCount, waitProvider, onRetryProvider); } } #endregion From 8b5b58dd77e7cec52cca9cc37bc0da104cc37ed1 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 29 Nov 2023 11:18:12 -0800 Subject: [PATCH 101/202] Refactor TestServices --- .../TerraWsmApiClientTests.cs | 2 +- ...sage.cs => RetryHandlersTResultHelpers.cs} | 24 +++++++++++-------- 2 files changed, 15 insertions(+), 11 deletions(-) rename src/Tes.ApiClients.Tests/TestServices/{RetryHandlersHelpers-HttpResponseMessage.cs => RetryHandlersTResultHelpers.cs} (63%) diff --git a/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs b/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs index 
fbe79fbed..79eb79c90 100644 --- a/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs +++ b/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs @@ -30,7 +30,7 @@ public void SetUp() var cache = new Mock(); cache.Setup(c => c.CreateEntry(It.IsAny())).Returns(new Mock().Object); cacheAndRetryHandler.SetupGet(c => c.AppCache).Returns(cache.Object); - asyncResponseRetryPolicy = new(TestServices.RetryHandlersHelpers.GetCachingHttpResponseMessageAsyncRetryPolicyMock(cacheAndRetryHandler)); + asyncResponseRetryPolicy = new(TestServices.RetryHandlersHelpers.GetCachingAsyncRetryPolicyMock(cacheAndRetryHandler, retryHandler => retryHandler.RetryDefaultHttpResponseMessagePolicyBuilder())); asyncRetryPolicy = new(TestServices.RetryHandlersHelpers.GetCachingAsyncRetryPolicyMock(cacheAndRetryHandler)); terraWsmApiClient = new TerraWsmApiClient(TerraApiStubData.WsmApiHost, tokenCredential.Object, cacheAndRetryHandler.Object, NullLogger.Instance); diff --git a/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers-HttpResponseMessage.cs b/src/Tes.ApiClients.Tests/TestServices/RetryHandlersTResultHelpers.cs similarity index 63% rename from src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers-HttpResponseMessage.cs rename to src/Tes.ApiClients.Tests/TestServices/RetryHandlersTResultHelpers.cs index 1e183f18b..9d352c49f 100644 --- a/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers-HttpResponseMessage.cs +++ b/src/Tes.ApiClients.Tests/TestServices/RetryHandlersTResultHelpers.cs @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. 
+using System.Linq.Expressions; using Moq; using Polly; using Polly.Retry; @@ -11,23 +12,26 @@ namespace Tes.ApiClients.Tests.TestServices { internal static partial class RetryHandlersHelpers { - internal static Mock> GetCachingHttpResponseMessageAsyncRetryPolicyMock(Mock cachingRetryHandler) + // TODO: Add ability to use a mocked ILogger with a mocked CachingRetryHandler where failures in the mocked retry handlers call the mocked ILogger. + // The opt-in would be an optional argument like this: "Microsoft.Extensions.Logging.ILogger logger". + + internal static Mock> GetCachingAsyncRetryPolicyMock(Mock cachingRetryHandler, Expression>> retryDefaultTResultPolicyBuilder) { - var cachingAsyncRetryPolicy = new Mock>(); + var cachingAsyncRetryPolicy = new Mock>(); _ = cachingAsyncRetryPolicy.As(); - var cachingAsyncPolicy = cachingAsyncRetryPolicy.As>(); + var cachingAsyncPolicy = cachingAsyncRetryPolicy.As>(); var cachingPolicy = cachingAsyncPolicy.As(); - _ = cachingAsyncRetryPolicy.As>(); - var cachingPolicyBuild = new Mock>(); + _ = cachingAsyncRetryPolicy.As>(); + var cachingPolicyBuild = new Mock>(); cachingPolicyBuild.Setup(policy => policy.BuildAsync()) .Returns(cachingAsyncRetryPolicy.Object); - cachingRetryHandler.As().Setup(policy => policy.CachingPolicyBuilder(It.IsAny>())) + cachingRetryHandler.As().Setup(policy => policy.CachingPolicyBuilder(It.IsAny>())) .Returns(cachingPolicyBuild.Object); - var builderBuild = new Mock>(); - var policyBuilderWait = new Mock>(); - policyBuilderWait.Setup(policy => policy.SetOnRetryBehavior(It.IsAny(), It.IsAny>(), It.IsAny>())) + var builderBuild = new Mock>(); + var policyBuilderWait = new Mock>(); + policyBuilderWait.Setup(policy => policy.SetOnRetryBehavior(It.IsAny(), It.IsAny>(), It.IsAny>())) .Returns(builderBuild.Object); - cachingRetryHandler.Setup(cachingRetryHandler => cachingRetryHandler.RetryDefaultHttpResponseMessagePolicyBuilder()) + cachingRetryHandler.Setup(retryDefaultTResultPolicyBuilder) 
.Returns(policyBuilderWait.Object); builderBuild.Setup(policy => policy.PolicyBuilderBase) .Returns(cachingRetryHandler.Object); From 371be0814bba6c019a374ab2d7ba17d44238b4d7 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 30 Nov 2023 01:40:14 -0800 Subject: [PATCH 102/202] Refactor for testability and to ensure that retries are logged when possible --- .../CacheAndRetryHandlerTest.cs | 6 +- .../TerraWsmApiClientTests.cs | 35 +- .../TestServices/RetryHandlersHelpers.cs | 58 ++- .../RetryHandlersTResultHelpers.cs | 45 -- src/Tes.ApiClients/CachingRetryHandler.cs | 402 +++++++++++------- src/Tes.ApiClients/HttpApiClient.cs | 39 +- src/Tes.ApiClients/RetryHandler.cs | 382 +++++++++++------ src/Tes.Runner/Docker/DockerExecutor.cs | 6 +- .../ContainerRegistryProviderTests.cs | 12 +- src/TesApi.Web/AzureProxy.cs | 11 +- .../CachingWithRetriesAzureProxy.cs | 12 +- .../Extensions/PagedInterfaceExtensions.cs | 16 +- src/TesApi.Web/Management/AzureProvider.cs | 2 +- 13 files changed, 608 insertions(+), 418 deletions(-) delete mode 100644 src/Tes.ApiClients.Tests/TestServices/RetryHandlersTResultHelpers.cs diff --git a/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs b/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs index 18c04a853..a5f6a61f1 100644 --- a/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs +++ b/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs @@ -13,8 +13,8 @@ namespace Tes.ApiClients.Tests; public class CacheAndRetryHandlerTest { private IMemoryCache appCache = null!; - private CachingRetryHandler.ICachingAsyncPolicy cachingAsyncPolicy = null!; - private CachingRetryHandler.ICachingAsyncPolicy cachingAsyncHttpResponseMessagePolicy = null!; + private CachingRetryHandler.CachingAsyncRetryHandlerPolicy cachingAsyncPolicy = null!; + private RetryHandler.AsyncRetryHandlerPolicy cachingAsyncHttpResponseMessagePolicy = null!; private Mock mockInstanceToRetry = null!; private const int MaxRetryCount = 3; @@ -30,7 +30,7 @@ public void SetUp() 
cachingAsyncHttpResponseMessagePolicy = cachingRetryHandler .RetryDefaultHttpResponseMessagePolicyBuilder() .SetOnRetryBehavior() - .AddCaching() + //.AddCaching() .BuildAsync(); cachingAsyncPolicy = cachingRetryHandler diff --git a/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs b/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs index 79eb79c90..ba16f045f 100644 --- a/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs +++ b/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs @@ -17,8 +17,7 @@ public class TerraWsmApiClientTests private TerraWsmApiClient terraWsmApiClient = null!; private Mock tokenCredential = null!; private Mock cacheAndRetryHandler = null!; - private Lazy> asyncRetryPolicy = null!; - private Lazy>> asyncResponseRetryPolicy = null!; + private Lazy>> asyncResponseRetryPolicy = null!; private TerraApiStubData terraApiStubData = null!; [TestInitialize] @@ -30,8 +29,7 @@ public void SetUp() var cache = new Mock(); cache.Setup(c => c.CreateEntry(It.IsAny())).Returns(new Mock().Object); cacheAndRetryHandler.SetupGet(c => c.AppCache).Returns(cache.Object); - asyncResponseRetryPolicy = new(TestServices.RetryHandlersHelpers.GetCachingAsyncRetryPolicyMock(cacheAndRetryHandler, retryHandler => retryHandler.RetryDefaultHttpResponseMessagePolicyBuilder())); - asyncRetryPolicy = new(TestServices.RetryHandlersHelpers.GetCachingAsyncRetryPolicyMock(cacheAndRetryHandler)); + asyncResponseRetryPolicy = new(TestServices.RetryHandlersHelpers.GetCachingAsyncRetryPolicyMock(cacheAndRetryHandler, c => c.RetryDefaultHttpResponseMessagePolicyBuilder())); terraWsmApiClient = new TerraWsmApiClient(TerraApiStubData.WsmApiHost, tokenCredential.Object, cacheAndRetryHandler.Object, NullLogger.Instance); } @@ -96,12 +94,9 @@ public async Task GetSasTokenAsync_ValidRequest_ReturnsPayload() Content = new StringContent(terraApiStubData.GetWsmSasTokenApiResponseInJson()) }; - asyncResponseRetryPolicy.Value.Setup(c => c.ExecuteAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) + 
asyncResponseRetryPolicy.Value.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(response); - asyncRetryPolicy.Value.Setup(c => c.ExecuteAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) - .Returns((Func> action, Polly.Context context, CancellationToken cancellationToken) => action(context, cancellationToken)); - var apiResponse = await terraWsmApiClient.GetSasTokenAsync(terraApiStubData.WorkspaceId, terraApiStubData.ContainerResourceId, null!, CancellationToken.None); @@ -119,13 +114,9 @@ public async Task GetContainerResourcesAsync_ValidRequest_ReturnsPayload() }; asyncResponseRetryPolicy.Value - .Setup(c => c.ExecuteAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) + .Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(response); - asyncRetryPolicy.Value - .Setup(c => c.ExecuteAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) - .Returns((Func> action, Polly.Context context, CancellationToken cancellationToken) => action(context, cancellationToken)); - var apiResponse = await terraWsmApiClient.GetContainerResourcesAsync(terraApiStubData.WorkspaceId, offset: 0, limit: 10, CancellationToken.None); @@ -141,7 +132,7 @@ public async Task DeleteBatchPoolAsync_204Response_Succeeds() var response = new HttpResponseMessage(HttpStatusCode.NoContent); asyncResponseRetryPolicy.Value - .Setup(c => c.ExecuteAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) + .Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(response); await terraWsmApiClient.DeleteBatchPoolAsync(terraApiStubData.WorkspaceId, wsmResourceId, CancellationToken.None); @@ -164,12 +155,8 @@ public async Task GetResourceQuotaAsync_ValidResourceIdReturnsQuotaInformationAn { var body = terraApiStubData.GetResourceQuotaApiResponseInJson(); - asyncResponseRetryPolicy.Value.Setup(c => c.ExecuteAsync( - It.IsAny>>(), It.IsAny(), It.IsAny())) - .ReturnsAsync(new HttpResponseMessage()); - - asyncRetryPolicy.Value.Setup(c => 
c.ExecuteAsync( - It.IsAny>>(), It.IsAny(), It.IsAny())) + asyncResponseRetryPolicy.Value.Setup(c => c.ExecuteWithRetryAndCachingAsync(It.IsAny(), + It.IsAny>>(), It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(body); var quota = await terraWsmApiClient.GetResourceQuotaAsync(terraApiStubData.WorkspaceId, terraApiStubData.BatchAccountId, cacheResults: true, cancellationToken: CancellationToken.None); @@ -195,12 +182,8 @@ public async Task GetLandingZoneResourcesAsync_ListOfLandingZoneResourcesAndGets { var body = terraApiStubData.GetResourceApiResponseInJson(); - asyncResponseRetryPolicy.Value.Setup(c => c.ExecuteAsync( - It.IsAny>>(), It.IsAny(), It.IsAny())) - .ReturnsAsync(new HttpResponseMessage()); - - asyncRetryPolicy.Value.Setup(c => c.ExecuteAsync( - It.IsAny>>(), It.IsAny(), It.IsAny())) + asyncResponseRetryPolicy.Value.Setup(c => c.ExecuteWithRetryAndCachingAsync(It.IsAny(), + It.IsAny>>(), It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(body); var resources = await terraWsmApiClient.GetLandingZoneResourcesAsync(terraApiStubData.WorkspaceId, CancellationToken.None); diff --git a/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers.cs b/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers.cs index 1b3ce7948..e24b25726 100644 --- a/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers.cs +++ b/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers.cs @@ -1,9 +1,10 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. +using System; +using System.Linq.Expressions; using Moq; using Polly; -using Polly.Retry; using static Tes.ApiClients.CachingRetryHandler; using static Tes.ApiClients.RetryHandler; @@ -14,30 +15,59 @@ internal static partial class RetryHandlersHelpers // TODO: Add ability to use a mocked ILogger with a mocked CachingRetryHandler where failures in the mocked retry handlers call the mocked ILogger. 
// The opt-in would be an optional argument like this: "Microsoft.Extensions.Logging.ILogger logger". - internal static Mock GetCachingAsyncRetryPolicyMock(Mock cachingRetryHandler) + internal static Mock GetCachingAsyncRetryPolicyMock(Mock cachingRetryHandler) { - var cachingAsyncRetryPolicy = new Mock(); - _ = cachingAsyncRetryPolicy.As(); - var cachingAsyncPolicy = cachingAsyncRetryPolicy.As(); - var cachingPolicy = cachingAsyncPolicy.As(); - _ = new Mock(); + var cachingAsyncRetryPolicy = new Mock(); + _ = cachingAsyncRetryPolicy.As(); var cachingPolicyBuild = new Mock(); cachingPolicyBuild.Setup(policy => policy.BuildAsync()) .Returns(cachingAsyncRetryPolicy.Object); cachingRetryHandler.As().Setup(policy => policy.CachingPolicyBuilder(It.IsAny())) .Returns(cachingPolicyBuild.Object); + var builderBuild = new Mock(); - var policyBuilderWait = new Mock(); - policyBuilderWait.Setup(policy => policy.SetOnRetryBehavior(It.IsAny(), It.IsAny(), It.IsAny())) - .Returns(builderBuild.Object); - cachingRetryHandler.Setup(cachingRetryHandler => cachingRetryHandler.RetryDefaultPolicyBuilder()) - .Returns(policyBuilderWait.Object); builderBuild.Setup(policy => policy.PolicyBuilderBase) .Returns(cachingRetryHandler.Object); - cachingPolicy.Setup(c => c.Handler) + builderBuild.Setup(c => c.BuildAsyncPolicy()) + .Returns((IAsyncPolicy)cachingAsyncRetryPolicy.Object); + var builderWait = new Mock(); + builderWait.Setup(c => c.SetOnRetryBehavior(It.IsAny(), It.IsAny(), It.IsAny())) + .Returns(builderBuild.Object); + + cachingRetryHandler.Setup(c => c.RetryDefaultPolicyBuilder()) + .Returns(builderWait.Object); + + cachingAsyncRetryPolicy.Setup(c => c.AppCache) + .Returns(cachingRetryHandler.Object.AppCache); + + return cachingAsyncRetryPolicy; + } + + internal static Mock> GetCachingAsyncRetryPolicyMock(Mock cachingRetryHandler, Expression>> expression) + { + var cachingAsyncRetryPolicy = new Mock>(); + _ = cachingAsyncRetryPolicy.As>(); + var cachingPolicyBuild = new Mock>(); + 
cachingPolicyBuild.Setup(policy => policy.BuildAsync()) + .Returns(cachingAsyncRetryPolicy.Object); + cachingRetryHandler.As().Setup(policy => policy.CachingPolicyBuilder(It.IsAny>())) + .Returns(cachingPolicyBuild.Object); + + var builderBuild = new Mock>(); + builderBuild.Setup(policy => policy.PolicyBuilderBase) .Returns(cachingRetryHandler.Object); - cachingPolicy.Setup(c => c.AppCache) + builderBuild.Setup(c => c.BuildAsyncPolicy()) + .Returns((IAsyncPolicy)cachingAsyncRetryPolicy.Object); + var builderWait = new Mock>(); + builderWait.Setup(c => c.SetOnRetryBehavior(It.IsAny(), It.IsAny>(), It.IsAny>())) + .Returns(builderBuild.Object); + + cachingRetryHandler.Setup(expression) + .Returns(builderWait.Object); + + cachingAsyncRetryPolicy.Setup(c => c.AppCache) .Returns(cachingRetryHandler.Object.AppCache); + return cachingAsyncRetryPolicy; } } diff --git a/src/Tes.ApiClients.Tests/TestServices/RetryHandlersTResultHelpers.cs b/src/Tes.ApiClients.Tests/TestServices/RetryHandlersTResultHelpers.cs deleted file mode 100644 index 9d352c49f..000000000 --- a/src/Tes.ApiClients.Tests/TestServices/RetryHandlersTResultHelpers.cs +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. - -using System.Linq.Expressions; -using Moq; -using Polly; -using Polly.Retry; -using static Tes.ApiClients.CachingRetryHandler; -using static Tes.ApiClients.RetryHandler; - -namespace Tes.ApiClients.Tests.TestServices -{ - internal static partial class RetryHandlersHelpers - { - // TODO: Add ability to use a mocked ILogger with a mocked CachingRetryHandler where failures in the mocked retry handlers call the mocked ILogger. - // The opt-in would be an optional argument like this: "Microsoft.Extensions.Logging.ILogger logger". 
- - internal static Mock> GetCachingAsyncRetryPolicyMock(Mock cachingRetryHandler, Expression>> retryDefaultTResultPolicyBuilder) - { - var cachingAsyncRetryPolicy = new Mock>(); - _ = cachingAsyncRetryPolicy.As(); - var cachingAsyncPolicy = cachingAsyncRetryPolicy.As>(); - var cachingPolicy = cachingAsyncPolicy.As(); - _ = cachingAsyncRetryPolicy.As>(); - var cachingPolicyBuild = new Mock>(); - cachingPolicyBuild.Setup(policy => policy.BuildAsync()) - .Returns(cachingAsyncRetryPolicy.Object); - cachingRetryHandler.As().Setup(policy => policy.CachingPolicyBuilder(It.IsAny>())) - .Returns(cachingPolicyBuild.Object); - var builderBuild = new Mock>(); - var policyBuilderWait = new Mock>(); - policyBuilderWait.Setup(policy => policy.SetOnRetryBehavior(It.IsAny(), It.IsAny>(), It.IsAny>())) - .Returns(builderBuild.Object); - cachingRetryHandler.Setup(retryDefaultTResultPolicyBuilder) - .Returns(policyBuilderWait.Object); - builderBuild.Setup(policy => policy.PolicyBuilderBase) - .Returns(cachingRetryHandler.Object); - cachingPolicy.Setup(c => c.Handler) - .Returns(cachingRetryHandler.Object); - cachingPolicy.Setup(c => c.AppCache) - .Returns(cachingRetryHandler.Object.AppCache); - return cachingAsyncRetryPolicy; - } - } -} diff --git a/src/Tes.ApiClients/CachingRetryHandler.cs b/src/Tes.ApiClients/CachingRetryHandler.cs index 4e17d1e52..62512d65d 100644 --- a/src/Tes.ApiClients/CachingRetryHandler.cs +++ b/src/Tes.ApiClients/CachingRetryHandler.cs @@ -6,122 +6,261 @@ using Polly; using Polly.Retry; using Tes.ApiClients.Options; -using static Tes.ApiClients.CachingRetryHandler; namespace Tes.ApiClients { /// - /// Extension methods for + /// Contains an App Cache instances and retry policies. 
/// - public static class CachingRetryHandlerExtensions + public partial class CachingRetryHandler : RetryHandler, CachingRetryHandler.ICachingPolicyBuilderHandler { + private readonly IMemoryCache appCache = null!; + public virtual IMemoryCache AppCache => appCache; + /// - /// Executes a delegate with the specified async retry policy and persisting the result in a cache. + /// Contains an App Cache instances and retry policies. /// - /// Asynchronous caching retry policy - /// - /// Action to execute - /// A for controlling the lifetime of the asynchronous operation. - /// Name of method originating the retriable operation. - /// - public static Task ExecuteWithRetryAndCachingAsync(this ICachingAsyncPolicy retryPolicy, string cacheKey, Func> action, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + /// > + /// + public CachingRetryHandler(IMemoryCache appCache, IOptions retryPolicyOptions) : base(retryPolicyOptions) { - ArgumentNullException.ThrowIfNull(retryPolicy); - ValidateArgs(cacheKey, action); + ArgumentNullException.ThrowIfNull(appCache); - return retryPolicy.Handler.ExecuteWithCacheAsync(cacheKey, () => retryPolicy.ExecuteWithRetryAsync(action, cancellationToken, caller)); + this.appCache = appCache; } /// - /// Executes a delegate with the specified async retry policy and persisting the result in a cache. + /// Protected parameter-less constructor for mocking /// - /// Asynchronous caching retry policy - /// - /// Action to execute - /// - /// A for controlling the lifetime of the asynchronous operation. - /// Name of method originating the retriable operation. 
- /// - /// - public static Task ExecuteWithRetryAndCachingAsync(this ICachingAsyncPolicy retryPolicy, string cacheKey, Func> action, DateTimeOffset cachesExpires, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) - { - ArgumentNullException.ThrowIfNull(retryPolicy); - ValidateArgs(cacheKey, action); + protected CachingRetryHandler() { } - return retryPolicy.Handler.ExecuteWithCacheAsync(cacheKey, () => retryPolicy.ExecuteWithRetryAsync(action, cancellationToken, caller), cachesExpires); - } + #region CachingRetryHandlerPolicies - public static ICachingPolicyBuilderBuild AddCaching(this RetryHandler.IPolicyBuilderBuild policyBuilder) + public class CachingRetryHandlerPolicy : RetryHandlerPolicy, ICachingPolicy { - return ((ICachingPolicyBuilderHandler)policyBuilder.PolicyBuilderBase).CachingPolicyBuilder(policyBuilder); + private readonly IMemoryCache appCache; + + public CachingRetryHandlerPolicy(ISyncPolicy retryPolicy, IMemoryCache appCache) + : base(retryPolicy) + => this.appCache = appCache; + + /// For mocking + public CachingRetryHandlerPolicy() { } + + + /// + /// App cache instance. + /// + public virtual IMemoryCache AppCache => appCache; + + /// + /// Executes a delegate with the specified policy. + /// + /// Synchronous retry policy. + /// Action to execute. + /// Name of method originating the retriable operation. + /// instance. 
+ public TResult ExecuteWithRetryAndCaching(string cacheKey, Func action, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + { + ArgumentNullException.ThrowIfNull(action); + + return appCache.GetOrCreate(cacheKey, _ => ExecuteWithRetry(action, caller)); + } } - public static ICachingPolicyBuilderBuild AddCaching(this RetryHandler.IPolicyBuilderBuild policyBuilder) + public class CachingAsyncRetryHandlerPolicy : AsyncRetryHandlerPolicy, ICachingPolicy { - return ((ICachingPolicyBuilderHandler)policyBuilder.PolicyBuilderBase).CachingPolicyBuilder(policyBuilder); + private readonly IMemoryCache appCache; + + public CachingAsyncRetryHandlerPolicy(IAsyncPolicy retryPolicy, IMemoryCache appCache) + : base(retryPolicy) + => this.appCache = appCache; + + /// For mocking + public CachingAsyncRetryHandlerPolicy() { } + + + /// + /// App cache instance. + /// + public virtual IMemoryCache AppCache => appCache; + + /// + /// Executes a delegate with the specified async retry policy and persisting the result in a cache. + /// + /// + /// Action to execute + /// A for controlling the lifetime of the asynchronous operation. + /// Name of method originating the retriable operation. + /// + public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + { + ValidateArgs(cacheKey, action); + + return await ExecuteWithCacheAsync(appCache, cacheKey, () => ExecuteWithRetryAsync(action, cancellationToken, caller)); + } + + /// + /// Executes a delegate with the specified async retry policy and persisting the result in a cache. + /// + /// + /// Action to execute + /// + /// A for controlling the lifetime of the asynchronous operation. 
+ /// + /// + public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, DateTimeOffset cachesExpires, CancellationToken cancellationToken) + { + ValidateArgs(cacheKey, action); + + return await ExecuteWithCacheAsync(appCache, cacheKey, () => ExecuteWithRetryAsync(action, cancellationToken), cachesExpires); + } } - } + //public class CachingRetryHandlerPolicy : RetryHandlerPolicy, ICachingPolicy + //{ + // private readonly IMemoryCache appCache; - /// - /// Contains an App Cache instances and retry policies. - /// - public partial class CachingRetryHandler : RetryHandler, ICachingPolicyBuilderHandler - { - private readonly IMemoryCache appCache = null!; + // public CachingRetryHandlerPolicy(ISyncPolicy retryPolicy, IMemoryCache appCache) + // : base(retryPolicy) + // => this.appCache = appCache; - /// - /// App cache instance. - /// - public virtual IMemoryCache AppCache => appCache; + // /// For mocking + // public CachingRetryHandlerPolicy() { } - /// - /// Contains an App Cache instances and retry policies. - /// - /// > - /// - public CachingRetryHandler(IMemoryCache appCache, IOptions retryPolicyOptions) : base(retryPolicyOptions) + + // /// + // /// App cache instance. + // /// + // public virtual IMemoryCache AppCache => appCache; + //} + + public class CachingAsyncRetryHandlerPolicy : AsyncRetryHandlerPolicy, ICachingPolicy { - ArgumentNullException.ThrowIfNull(appCache); + private readonly IMemoryCache appCache; + + public CachingAsyncRetryHandlerPolicy(IAsyncPolicy retryPolicy, IMemoryCache appCache) + : base(retryPolicy) + => this.appCache = appCache; + + /// For mocking + public CachingAsyncRetryHandlerPolicy() { } + + + /// + /// App cache instance. + /// + public virtual IMemoryCache AppCache => appCache; + + /// + /// Executes a delegate with the specified async retry policy and persisting the result in a cache. + /// + /// + /// Action to execute + /// A for controlling the lifetime of the asynchronous operation. 
+ /// Name of method originating the retriable operation. + /// + public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + { + ValidateArgs(cacheKey, action); - this.appCache = appCache; - } + return await ExecuteWithCacheAsync(appCache, cacheKey, () => ExecuteWithRetryAsync(action, cancellationToken, caller)); + } - /// - /// Protected parameter-less constructor for mocking - /// - protected CachingRetryHandler() { } + /// + /// Executes a delegate with the specified async retry policy and persisting the result in a cache. + /// + /// + /// Action to execute + /// + /// A for controlling the lifetime of the asynchronous operation. + /// + public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, DateTimeOffset cachesExpires, CancellationToken cancellationToken) + { + ValidateArgs(cacheKey, action); + + return await ExecuteWithCacheAsync(appCache, cacheKey, () => ExecuteWithRetryAsync(action, cancellationToken), cachesExpires); + } + + /// + /// Executes a delegate with the specified async retry policy and persisting the result in a cache. + /// + /// Final return type + /// + /// Action to execute + /// Method to convert + /// A for controlling the lifetime of the asynchronous operation. + /// Name of method originating the retriable operation. + /// + public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, Func> convert, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + { + ValidateArgs(cacheKey, action); + + return await ExecuteWithCacheAsync(appCache, cacheKey, async () => await convert(await ExecuteWithRetryAsync(action, cancellationToken, caller), cancellationToken)); + } + + /// + /// Executes a delegate with the specified async retry policy and persisting the result in a cache. 
+ /// + /// Final return type + /// + /// Action to execute + /// Method to convert + /// + /// A for controlling the lifetime of the asynchronous operation. + /// Name of method originating the retriable operation. + /// + public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, Func> convert, DateTimeOffset cachesExpires, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + { + ValidateArgs(cacheKey, action); + + return await ExecuteWithCacheAsync(appCache, cacheKey, async () => await convert(await ExecuteWithRetryAsync(action, cancellationToken, caller), cancellationToken), cachesExpires); + } + } + #endregion #region Builder interfaces public interface ICachingPolicy { IMemoryCache AppCache { get; } - - /// Used internally and for testing. - public CachingRetryHandler Handler { get; } } - public interface ICachingSyncPolicy : ICachingPolicy, ISyncPolicy { } - public interface ICachingSyncPolicy : ICachingPolicy, ISyncPolicy { } - public interface ICachingAsyncPolicy : ICachingPolicy, IAsyncPolicy { } - public interface ICachingAsyncPolicy : ICachingPolicy, IAsyncPolicy { } - public interface ICachingPolicyBuilderBuild { - ICachingSyncPolicy Build(); - ICachingAsyncPolicy BuildAsync(); + /// + /// Builds with caching. + /// + /// Caching retry policy. + CachingRetryHandlerPolicy Build(); + + /// + /// Builds with caching. + /// + /// Caching retry policy. + CachingAsyncRetryHandlerPolicy BuildAsync(); } public interface ICachingPolicyBuilderBuild { - ICachingSyncPolicy Build(); - ICachingAsyncPolicy BuildAsync(); + ///// + ///// Builds with caching. + ///// + ///// Caching retry policy. + //CachingRetryHandlerPolicy Build(); + + /// + /// Builds with caching. + /// + /// Caching retry policy. + CachingAsyncRetryHandlerPolicy BuildAsync(); } /// Used internally and for testing. public interface ICachingPolicyBuilderHandler { + /// Used internally and for testing. 
ICachingPolicyBuilderBuild CachingPolicyBuilder(IPolicyBuilderBuild policyBuilder); + /// Used internally and for testing. ICachingPolicyBuilderBuild CachingPolicyBuilder(IPolicyBuilderBuild policyBuilder); } #endregion @@ -145,14 +284,14 @@ public CachingPolicyBuilderBuild(IPolicyBuilderBuild policyBuilder, CachingRetry this.cachingHandler = handler; } - public ICachingSyncPolicy Build() + CachingRetryHandlerPolicy ICachingPolicyBuilderBuild.Build() { - return new CachingRetryPolicy(cachingHandler, policyBuilder.Build()); + return new(policyBuilder.BuildPolicy(), cachingHandler.AppCache); } - public ICachingAsyncPolicy BuildAsync() + CachingAsyncRetryHandlerPolicy ICachingPolicyBuilderBuild.BuildAsync() { - return new CachingAsyncRetryPolicy(cachingHandler, policyBuilder.BuildAsync()); + return new(policyBuilder.BuildAsyncPolicy(), cachingHandler.AppCache); } } @@ -168,89 +307,14 @@ public CachingPolicyBuilderBuild(IPolicyBuilderBuild policyBuilder, Cac this.cachingHandler = handler; } - ICachingSyncPolicy ICachingPolicyBuilderBuild.Build() - { - return new CachingRetryPolicy(cachingHandler, policyBuilder.Build()); - } - - ICachingAsyncPolicy ICachingPolicyBuilderBuild.BuildAsync() - { - return new CachingAsyncRetryPolicy(cachingHandler, policyBuilder.BuildAsync()); - } - } - - private partial class CachingRetryPolicy : IRetryPolicy, ICachingSyncPolicy - { - [BeaKona.AutoInterface(typeof(ISyncPolicy), IncludeBaseInterfaces = true)] - [BeaKona.AutoInterface(typeof(IRetryPolicy), IncludeBaseInterfaces = true)] - private readonly ISyncPolicy policy; - private readonly CachingRetryHandler handler; - - public IMemoryCache AppCache => handler.AppCache; - CachingRetryHandler ICachingPolicy.Handler => handler; - - public CachingRetryPolicy(CachingRetryHandler handler, ISyncPolicy policy) - { - ArgumentNullException.ThrowIfNull(policy); - this.policy = policy; - this.handler = handler; - } - } - - private partial class CachingRetryPolicy : IRetryPolicy, ICachingSyncPolicy 
- { - [BeaKona.AutoInterface(IncludeBaseInterfaces = true)] - private readonly ISyncPolicy policy; - [BeaKona.AutoInterface(IncludeBaseInterfaces = true)] - private readonly IRetryPolicy retryPolicy; - private readonly CachingRetryHandler handler; - - public IMemoryCache AppCache => handler.AppCache; - CachingRetryHandler ICachingPolicy.Handler => handler; - - public CachingRetryPolicy(CachingRetryHandler handler, ISyncPolicy policy) - { - ArgumentNullException.ThrowIfNull(policy); - retryPolicy = (IRetryPolicy)policy; - this.policy = policy; - this.handler = handler; - } - } - - private partial class CachingAsyncRetryPolicy : IRetryPolicy, ICachingAsyncPolicy - { - [BeaKona.AutoInterface(typeof(IAsyncPolicy), IncludeBaseInterfaces = true)] - [BeaKona.AutoInterface(typeof(IRetryPolicy), IncludeBaseInterfaces = true)] - private readonly IAsyncPolicy policy; - private readonly CachingRetryHandler handler; - - public IMemoryCache AppCache => handler.AppCache; - CachingRetryHandler ICachingPolicy.Handler => handler; + //CachingRetryHandlerPolicy ICachingPolicyBuilderBuild.Build() + //{ + // return new(policyBuilder.BuildPolicy(), cachingHandler.AppCache); + //} - public CachingAsyncRetryPolicy(CachingRetryHandler handler, IAsyncPolicy policy) + CachingAsyncRetryHandlerPolicy ICachingPolicyBuilderBuild.BuildAsync() { - ArgumentNullException.ThrowIfNull(policy); - this.policy = policy; - this.handler = handler; - } - } - - private partial class CachingAsyncRetryPolicy : IRetryPolicy, ICachingAsyncPolicy - { - [BeaKona.AutoInterface(IncludeBaseInterfaces = true)] - private readonly IAsyncPolicy policy; - [BeaKona.AutoInterface(IncludeBaseInterfaces = true)] - private readonly IRetryPolicy retryPolicy; - private readonly CachingRetryHandler handler; - - public IMemoryCache AppCache => handler.AppCache; - CachingRetryHandler ICachingPolicy.Handler => handler; - - public CachingAsyncRetryPolicy(CachingRetryHandler handler, IAsyncPolicy policy) - { - retryPolicy = 
(IRetryPolicy)policy; - this.policy = policy; - this.handler = handler; + return new(policyBuilder.BuildAsyncPolicy(), cachingHandler.AppCache); } } #endregion @@ -265,14 +329,40 @@ internal static void ValidateArgs(string cacheKey, Func } } - internal async Task ExecuteWithCacheAsync(string cacheKey, Func> action) + private static async Task ExecuteWithCacheAsync(IMemoryCache appCache, string cacheKey, Func> action) => await appCache.GetOrCreateAsync(cacheKey, _ => action()); - internal async Task ExecuteWithCacheAsync(string cacheKey, Func> action, DateTimeOffset cacheExpires) + private static async Task ExecuteWithCacheAsync(IMemoryCache appCache, string cacheKey, Func> action, DateTimeOffset cacheExpires) => await appCache.GetOrCreateAsync(cacheKey, entry => { entry.AbsoluteExpiration = cacheExpires; return action(); }); } + + /// + /// Extension methods for + /// + public static class CachingRetryHandlerExtensions + { + /// + /// Default caching policy. + /// + /// policy builder. + /// OnRetry builder + public static CachingRetryHandler.ICachingPolicyBuilderBuild AddCaching(this RetryHandler.IPolicyBuilderBuild policyBuilder) + { + return ((CachingRetryHandler.ICachingPolicyBuilderHandler)policyBuilder.PolicyBuilderBase).CachingPolicyBuilder(policyBuilder); + } + + /// + /// Default caching policy. + /// + /// policy builder. 
+ /// OnRetry builder + public static CachingRetryHandler.ICachingPolicyBuilderBuild AddCaching(this RetryHandler.IPolicyBuilderBuild policyBuilder) + { + return ((CachingRetryHandler.ICachingPolicyBuilderHandler)policyBuilder.PolicyBuilderBase).CachingPolicyBuilder(policyBuilder); + } + } } diff --git a/src/Tes.ApiClients/HttpApiClient.cs b/src/Tes.ApiClients/HttpApiClient.cs index aabda7124..c320657b7 100644 --- a/src/Tes.ApiClients/HttpApiClient.cs +++ b/src/Tes.ApiClients/HttpApiClient.cs @@ -6,7 +6,6 @@ using System.Text; using System.Text.Json; using Azure.Core; -using Microsoft.Extensions.Caching.Memory; using Microsoft.Extensions.Logging; namespace Tes.ApiClients @@ -27,8 +26,7 @@ public abstract class HttpApiClient private readonly SemaphoreSlim semaphore = new(1, 1); private AccessToken accessToken; - protected readonly CachingRetryHandler.ICachingAsyncPolicy AsyncCachingHttpResponseMessageRetryPolicy; - protected readonly CachingRetryHandler.ICachingAsyncPolicy AsyncCachingRetryPolicy; + protected readonly CachingRetryHandler.CachingAsyncRetryHandlerPolicy AsyncCachingHttpResponseMessageRetryPolicy; /// /// Inner http client. 
@@ -47,12 +45,6 @@ protected HttpApiClient(CachingRetryHandler cachingRetryHandler, ILogger logger) this.Logger = logger; - AsyncCachingRetryPolicy = cachingRetryHandler - .RetryDefaultPolicyBuilder() - .SetOnRetryBehavior(logger: this.Logger) - .AddCaching() - .BuildAsync(); - AsyncCachingHttpResponseMessageRetryPolicy = cachingRetryHandler .RetryDefaultHttpResponseMessagePolicyBuilder() .SetOnRetryBehavior(onRetry: LogRetryErrorOnRetryHttpResponseMessageHandler()) @@ -171,8 +163,16 @@ protected async Task HttpGetRequestWithCachingAndRetryPolicyAsync(Uri re { var cacheKey = await ToCacheKeyAsync(requestUrl, setAuthorizationHeader, cancellationToken); - return (await AsyncCachingRetryPolicy.AppCache.GetOrCreateAsync(cacheKey, - async _ => await HttpGetRequestWithRetryPolicyAsync(requestUrl, cancellationToken, setAuthorizationHeader)))!; + return (await AsyncCachingHttpResponseMessageRetryPolicy.ExecuteWithRetryAndCachingAsync(cacheKey, + async ct => + { + //request must be recreated in every retry. + var httpRequest = await CreateGetHttpRequest(requestUrl, setAuthorizationHeader, ct); + + var httpResponse = await HttpClient.SendAsync(httpRequest, ct); + return httpResponse.EnsureSuccessStatusCode(); + }, + ReadResponseBodyAsync, cancellationToken))!; } /// @@ -185,19 +185,16 @@ protected async Task HttpGetRequestWithCachingAndRetryPolicyAsync(Uri re protected async Task HttpGetRequestWithRetryPolicyAsync(Uri requestUrl, CancellationToken cancellationToken, bool setAuthorizationHeader = false) { - return await AsyncCachingRetryPolicy.ExecuteWithRetryAsync(async token => + var response = await AsyncCachingHttpResponseMessageRetryPolicy.ExecuteWithRetryAsync(async ct => { - var response = await AsyncCachingHttpResponseMessageRetryPolicy.ExecuteWithRetryAsync(async ct => - { - //request must be recreated in every retry. - var httpRequest = await CreateGetHttpRequest(requestUrl, setAuthorizationHeader, ct); + //request must be recreated in every retry. 
+ var httpRequest = await CreateGetHttpRequest(requestUrl, setAuthorizationHeader, ct); - var httpResponse = await HttpClient.SendAsync(httpRequest, ct); - return httpResponse.EnsureSuccessStatusCode(); - }, token); - - return await ReadResponseBodyAsync(response, token); + var httpResponse = await HttpClient.SendAsync(httpRequest, ct); + return httpResponse.EnsureSuccessStatusCode(); }, cancellationToken); + + return await ReadResponseBodyAsync(response, cancellationToken); } /// diff --git a/src/Tes.ApiClients/RetryHandler.cs b/src/Tes.ApiClients/RetryHandler.cs index 3ea7436db..78ce1fa30 100644 --- a/src/Tes.ApiClients/RetryHandler.cs +++ b/src/Tes.ApiClients/RetryHandler.cs @@ -10,83 +10,6 @@ namespace Tes.ApiClients; -/// -/// Extension methods for -/// -public static class RetryHandlerExtensions -{ - /// - /// Executes a delegate with the specified policy. - /// - /// Result type. - /// Synchronous retry policy. - /// Action to execute. - /// Name of method originating the retriable operation. - /// instance. - public static TResult ExecuteWithRetry(this ISyncPolicy retryPolicy, Func action, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) - { - ArgumentNullException.ThrowIfNull(retryPolicy); - ArgumentNullException.ThrowIfNull(action); - - return retryPolicy.Execute(_ => action(), PrepareContext(caller)); - } - - /// - /// Executes a delegate with the specified async policy. - /// - /// Result type. - /// Asynchronous retry policy. - /// Action to execute. - /// A for controlling the lifetime of the asynchronous operation. - /// Name of method originating the retriable operation. - /// instance. 
- public static Task ExecuteWithRetryAsync(this IAsyncPolicy retryPolicy, Func> action, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) - { - ArgumentNullException.ThrowIfNull(retryPolicy); - ArgumentNullException.ThrowIfNull(action); - - return retryPolicy.ExecuteAsync((_, ct) => action(ct), PrepareContext(caller), cancellationToken); - } - - - /// - /// Executes a delegate with the specified async policy. - /// - /// Asynchronous retry policy. - /// Action to execute. - /// A for controlling the lifetime of the asynchronous operation. - /// Name of method originating the retriable operation. - /// - public static Task ExecuteWithRetryAsync(this IAsyncPolicy retryPolicy, Func action, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) - { - ArgumentNullException.ThrowIfNull(retryPolicy); - ArgumentNullException.ThrowIfNull(action); - - return retryPolicy.ExecuteAsync((_, ct) => action(ct), PrepareContext(caller), cancellationToken); - } - - /// - /// Executes a delegate with the specified async policy. - /// - /// Asynchronous retry policy - /// Action to execute - /// A for controlling the lifetime of the asynchronous operation. - /// Name of method originating the retriable operation. - /// Result HttpResponse - public static Task ExecuteWithRetryAsync(this IAsyncPolicy retryPolicy, Func> action, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) - { - ArgumentNullException.ThrowIfNull(retryPolicy); - ArgumentNullException.ThrowIfNull(action); - - return retryPolicy.ExecuteAsync((_, ct) => action(ct), PrepareContext(caller), cancellationToken); - } - - private static Context PrepareContext(string caller) => new() - { - [RetryHandler.CallerMemberNameKey] = caller - }; -} - /// /// Utility class that facilitates the retry policy implementations for HTTP clients. 
/// @@ -120,6 +43,10 @@ public virtual IPolicyBuilderWait RetryDefaultHttpResponseM .OpinionatedRetryPolicy(DefaultHttpResponseMessagePolicyBuilder) .WithRetryPolicyOptionsWait(); + /// + /// Public constructor + /// + /// Retry policy options public RetryHandler(IOptions retryPolicyOptions) { ArgumentNullException.ThrowIfNull(retryPolicyOptions); @@ -131,6 +58,170 @@ public RetryHandler(IOptions retryPolicyOptions) /// protected RetryHandler() { } + #region RetryHandlerPolicies + /// + /// Non-generic synchronous retry policy + /// + public class RetryHandlerPolicy + { + private readonly ISyncPolicy retryPolicy; + + public ISyncPolicy RetryPolicy => retryPolicy; + + /// + /// Public constructor + /// + /// Synchronous retry policy. + public RetryHandlerPolicy(ISyncPolicy retryPolicy) + { + ArgumentNullException.ThrowIfNull(retryPolicy); + this.retryPolicy = retryPolicy; + } + + /// For mocking + public RetryHandlerPolicy() { } + + + /// + /// Executes a delegate with the configured policy. + /// + /// Result type. + /// Action to execute. + /// Name of method originating the retriable operation. + /// instance. + public virtual TResult ExecuteWithRetry(Func action, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + { + ArgumentNullException.ThrowIfNull(action); + + return retryPolicy.Execute(_ => action(), RetryHandler.PrepareContext(caller)); + } + } + + ///// + ///// Generic synchronous retry policy + ///// + //public class RetryHandlerPolicy + //{ + // private readonly ISyncPolicy retryPolicy; + + // /// + // /// Public constructor + // /// + // /// Synchronous retry policy. + // public RetryHandlerPolicy(ISyncPolicy retryPolicy) + // { + // ArgumentNullException.ThrowIfNull(retryPolicy); + // this.retryPolicy = retryPolicy; + // } + + // /// For mocking + // public RetryHandlerPolicy() { } + + + // /// + // /// Executes a delegate with the configured policy. + // /// + // /// Action to execute. 
+ // /// Name of method originating the retriable operation. + // /// + // public virtual TResult ExecuteWithRetry(Func action, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + // { + // ArgumentNullException.ThrowIfNull(action); + + // return retryPolicy.Execute(_ => action(), PrepareContext(caller)); + // } + //} + + /// + /// Non-generic asynchronous retry policy + /// + public class AsyncRetryHandlerPolicy + { + private readonly IAsyncPolicy retryPolicy; + + public IAsyncPolicy RetryPolicy => retryPolicy; + + /// + /// Public constructor + /// + /// Asynchronous retry policy. + public AsyncRetryHandlerPolicy(IAsyncPolicy retryPolicy) + { + ArgumentNullException.ThrowIfNull(retryPolicy); + this.retryPolicy = retryPolicy; + } + + /// For mocking + public AsyncRetryHandlerPolicy() { } + + + /// + /// Executes a delegate with the configured async policy. + /// + /// Result type. + /// Action to execute. + /// A for controlling the lifetime of the asynchronous operation. + /// Name of method originating the retriable operation. + /// instance. + public virtual Task ExecuteWithRetryAsync(Func> action, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + { + ArgumentNullException.ThrowIfNull(action); + + return retryPolicy.ExecuteAsync((_, ct) => action(ct), PrepareContext(caller), cancellationToken); + } + + /// + /// Executes a delegate with the configured async policy. + /// + /// Action to execute. + /// A for controlling the lifetime of the asynchronous operation. + /// Name of method originating the retriable operation. 
+ /// + public virtual Task ExecuteWithRetryAsync(Func action, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + { + ArgumentNullException.ThrowIfNull(action); + + return retryPolicy.ExecuteAsync((_, ct) => action(ct), PrepareContext(caller), cancellationToken); + } + } + + /// + /// Generic asynchronous retry policy + /// + public class AsyncRetryHandlerPolicy + { + private readonly IAsyncPolicy retryPolicy; + + /// + /// Public constructor + /// + /// Asynchronous retry policy. + public AsyncRetryHandlerPolicy(IAsyncPolicy retryPolicy) + { + ArgumentNullException.ThrowIfNull(retryPolicy); + this.retryPolicy = retryPolicy; + } + + /// For mocking + public AsyncRetryHandlerPolicy() { } + + + /// + /// Executes a delegate with the configured async policy. + /// + /// Action to execute + /// A for controlling the lifetime of the asynchronous operation. + /// Name of method originating the retriable operation. + /// + public virtual Task ExecuteWithRetryAsync(Func> action, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + { + ArgumentNullException.ThrowIfNull(action); + + return retryPolicy.ExecuteAsync((_, ct) => action(ct), PrepareContext(caller), cancellationToken); + } + } + #endregion + #region Builder interfaces public interface IPolicyBuilderPolicy { @@ -161,7 +252,7 @@ public interface IPolicyBuilderBase /// /// Default wait policy. /// - /// Wait policy. + /// OnRetry hander IPolicyBuilderWait WithRetryPolicyOptionsWait(); /// @@ -169,7 +260,7 @@ public interface IPolicyBuilderBase /// /// Maximum number of retries. /// Value in seconds which is raised by the power of the retry attempt. - /// Wait policy. + /// OnRetry hander IPolicyBuilderWait WithExponentialBackoffWait(int maxRetryCount, double exponentialBackOffExponent); /// @@ -177,7 +268,7 @@ public interface IPolicyBuilderBase /// /// Maximum number of retries. 
/// Wait policy. - /// Wait policy. + /// OnRetry hander IPolicyBuilderWait WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func waitDurationProvider); /// @@ -187,15 +278,15 @@ public interface IPolicyBuilderBase /// Maximum number of retries. /// Value in seconds which is raised by the power of the backup retry attempt. /// True to pass backup wait provider its own attempt values, False to provide overall attemp values. - /// Wait policy. - IPolicyBuilderWait WithExceptionBasedWaitWithExponentialBackoffBackup(Func waitDurationProvider, int maxRetryCount, int exponentialBackOffExponent, bool backupSkipProvidedIncrements); + /// OnRetry hander + IPolicyBuilderWait WithExceptionBasedWaitWithExponentialBackoffBackup(Func waitDurationProvider, int maxRetryCount, double exponentialBackOffExponent, bool backupSkipProvidedIncrements); /// /// Custom optional exception-based wait policy backed up by the default wait policy. /// /// Wait policy that can return to use the backup wait policy. /// True to pass backup wait provider its own attempt values, False to provide overall attemp values. - /// Wait policy. + /// OnRetry hander IPolicyBuilderWait WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func waitDurationProvider, bool backupSkipProvidedIncrements); } @@ -204,7 +295,7 @@ public interface IPolicyBuilderBase /// /// Default wait policy. /// - /// Wait policy. + /// OnRetry hander IPolicyBuilderWait WithRetryPolicyOptionsWait(); /// @@ -212,7 +303,7 @@ public interface IPolicyBuilderBase /// /// Maximum number of retries. /// Value in seconds which is raised by the power of the retry attempt. - /// Wait policy. + /// OnRetry hander IPolicyBuilderWait WithExponentialBackoffWait(int retryCount, double exponentialBackOffExponent); /// @@ -221,7 +312,7 @@ public interface IPolicyBuilderBase /// Maximum number of retries. /// Wait policy. /// Wait policy. 
- /// + /// OnRetry hander IPolicyBuilderWait WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func, TimeSpan> waitDurationProvider); /// @@ -229,7 +320,7 @@ public interface IPolicyBuilderBase /// /// Maximum number of retries. /// Wait policy. - /// Wait policy. + /// OnRetry hander IPolicyBuilderWait WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func waitDurationProvider); /// @@ -239,15 +330,15 @@ public interface IPolicyBuilderBase /// Maximum number of retries. /// Value in seconds which is raised by the power of the backup retry attempt. /// True to pass backup wait provider its own attempt values, False to provide overall attemp values. - /// Wait policy. - IPolicyBuilderWait WithExceptionBasedWaitWithExponentialBackoffBackup(Func waitDurationProvider, int retryCount, int exponentialBackOffExponent, bool backupSkipProvidedIncrements); + /// OnRetry hander + IPolicyBuilderWait WithExceptionBasedWaitWithExponentialBackoffBackup(Func waitDurationProvider, int retryCount, double exponentialBackOffExponent, bool backupSkipProvidedIncrements); /// /// Custom optional exception-based wait policy backed up by the default wait policy. /// /// Wait policy that can return to use the backup wait policy. /// True to pass backup wait provider its own attempt values, False to provide overall attemp values. - /// Wait policy. + /// OnRetry hander IPolicyBuilderWait WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func waitDurationProvider, bool backupSkipProvidedIncrements); } @@ -259,7 +350,7 @@ public interface IPolicyBuilderWait /// Logger to enable retry logging. /// Custom onretry handler. /// Custom async onretry handler. Only applies to . - /// OnRetry hander + /// OnRetry builder IPolicyBuilderBuild SetOnRetryBehavior(ILogger logger = default, OnRetryHandler onRetry = default, OnRetryHandlerAsync onRetryAsync = default); } @@ -271,7 +362,7 @@ public interface IPolicyBuilderWait /// Logger to enable retry logging. /// Custom onretry handler. 
/// Custom async onretry handler. Only applies to . - /// OnRetry hander + /// OnRetry builder IPolicyBuilderBuild SetOnRetryBehavior(ILogger logger = default, OnRetryHandler onRetry = default, OnRetryHandlerAsync onRetryAsync = default); } @@ -281,13 +372,25 @@ public interface IPolicyBuilderBuild /// Builds . /// /// Retry policy. - ISyncPolicy Build(); + RetryHandlerPolicy Build(); + + /// + /// Builds for extensions to the builder. + /// + /// Retry policy. + ISyncPolicy BuildPolicy(); /// /// Builds . /// /// Async retry policy. - IAsyncPolicy BuildAsync(); + AsyncRetryHandlerPolicy BuildAsync(); + + /// + /// Builds for extensions to the builder. + /// + /// Async retry policy. + IAsyncPolicy BuildAsyncPolicy(); /// /// Retrives the instance of the retryhandler to accomodate extensions to the builder @@ -297,17 +400,29 @@ public interface IPolicyBuilderBuild public interface IPolicyBuilderBuild { + ///// + ///// Builds . + ///// + ///// Retry policy. + //RetryHandlerPolicy Build(); + + ///// + ///// Builds for extensions to the builder. + ///// + ///// Retry policy. + //ISyncPolicy BuildPolicy(); + /// - /// Builds . + /// Builds . /// - /// Retry policy. - ISyncPolicy Build(); + /// Async retry policy. + AsyncRetryHandlerPolicy BuildAsync(); /// - /// Builds . + /// Builds for extensions to the builder. /// /// Async retry policy. 
- IAsyncPolicy BuildAsync(); + IAsyncPolicy BuildAsyncPolicy(); /// /// Retrives the instance of the retryhandler to accomodate extensions to the builder @@ -421,7 +536,7 @@ public static Func DefaultSleepDurationProvid public static Func ExponentialSleepDurationProvider(double exponentialBackOffExponent) => (attempt, _1, _2) => TimeSpan.FromSeconds(Math.Pow(exponentialBackOffExponent, attempt)); - public static Func ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(Func sleepDurationProvider, int exponentialBackOffExponent, bool backupSkipProvidedIncrements) + public static Func ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(Func sleepDurationProvider, double exponentialBackOffExponent, bool backupSkipProvidedIncrements) => (attempt, exception, ctx) => { return backupSkipProvidedIncrements @@ -467,7 +582,7 @@ IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithRetryPolicyOptio => new PolicyBuilderWait(this, Defaults.PolicyOptions.MaxRetryCount, ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, Defaults.PolicyOptions.ExponentialBackOffExponent, backupSkipProvidedIncrements)); /// - IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithExponentialBackoffBackup(Func sleepDurationProvider, int retryCount, int exponentialBackOffExponent, bool backupSkipProvidedIncrements) + IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithExponentialBackoffBackup(Func sleepDurationProvider, int retryCount, double exponentialBackOffExponent, bool backupSkipProvidedIncrements) => new PolicyBuilderWait(this, retryCount, ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, exponentialBackOffExponent, backupSkipProvidedIncrements)); } @@ -505,7 +620,7 @@ IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWi => new PolicyBuilderWait(this, Defaults.PolicyOptions.MaxRetryCount, default, 
PolicyBuilderBase.ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, Defaults.PolicyOptions.ExponentialBackOffExponent, backupSkipProvidedIncrements)); /// - IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithExponentialBackoffBackup(Func sleepDurationProvider, int retryCount, int exponentialBackOffExponent, bool backupSkipProvidedIncrements) + IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithExponentialBackoffBackup(Func sleepDurationProvider, int retryCount, double exponentialBackOffExponent, bool backupSkipProvidedIncrements) => new PolicyBuilderWait(this, retryCount, default, PolicyBuilderBase.ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, exponentialBackOffExponent, backupSkipProvidedIncrements)); } @@ -542,7 +657,7 @@ internal PolicyBuilderWait(PolicyBuilderBase policyBuilder, int retryCo { if (sleepDurationProviderException is null && sleepDurationProviderResult is null) { - throw new ArgumentNullException(); + throw new ArgumentNullException(null, $"At least one of {nameof(sleepDurationProviderResult)} or {nameof(sleepDurationProviderException)} must be provided."); } this.policyBuilder = policyBuilder; @@ -612,7 +727,7 @@ public static Func OnRetryHandlerAsync( } /// - ISyncPolicy IPolicyBuilderBuild.Build() + ISyncPolicy IPolicyBuilderBuild.BuildPolicy() { var waitProvider = sleepDurationProvider; var onRetryProvider = OnRetryHandler(logger, onRetryHandler); @@ -621,13 +736,21 @@ ISyncPolicy IPolicyBuilderBuild.Build() } /// - IAsyncPolicy IPolicyBuilderBuild.BuildAsync() + IAsyncPolicy IPolicyBuilderBuild.BuildAsyncPolicy() { var waitProvider = sleepDurationProvider; var onRetryProvider = OnRetryHandlerAsync(logger, onRetryHandler, onRetryHandlerAsync); return policyBuilder.policyBuilder.policyBuilder.WaitAndRetryAsync(policyBuilder.retryCount, waitProvider, onRetryProvider); } + + /// + RetryHandlerPolicy IPolicyBuilderBuild.Build() + => 
new(((IPolicyBuilderBuild)this).BuildPolicy()); + + /// + AsyncRetryHandlerPolicy IPolicyBuilderBuild.BuildAsync() + => new(((IPolicyBuilderBuild)this).BuildAsyncPolicy()); } private readonly struct PolicyBuilderBuild : IPolicyBuilderBuild @@ -667,16 +790,16 @@ private static Action, TimeSpan, int, Context> Logger(IL }; } - private static Action, TimeSpan, int, Context> OnRetryHandler(ILogger logger, OnRetryHandler onRetryHandler) - { - var genericHandler = onRetryHandler ?? new((outcome, timeSpan, retryCount, correlationId, caller) => { }); + //private static Action, TimeSpan, int, Context> OnRetryHandler(ILogger logger, OnRetryHandler onRetryHandler) + //{ + // var genericHandler = onRetryHandler ?? new((outcome, timeSpan, retryCount, correlationId, caller) => { }); - return (outcome, timeSpan, retryCount, ctx) => - { - genericHandler(outcome, timeSpan, retryCount, ctx.CorrelationId, ctx[CallerMemberNameKey] as string); - Logger(logger)(outcome, timeSpan, retryCount, ctx); - }; - } + // return (outcome, timeSpan, retryCount, ctx) => + // { + // genericHandler(outcome, timeSpan, retryCount, ctx.CorrelationId, ctx[CallerMemberNameKey] as string); + // Logger(logger)(outcome, timeSpan, retryCount, ctx); + // }; + //} private static Func, TimeSpan, int, Context, Task> OnRetryHandlerAsync(ILogger logger, OnRetryHandler onRetryHandler, OnRetryHandlerAsync onRetryHandlerAsync) { @@ -694,24 +817,37 @@ private static Func, TimeSpan, int, Context, Task> OnRet }; } - /// - ISyncPolicy IPolicyBuilderBuild.Build() - { - var waitProvider = sleepDurationProvider; - var onRetryProvider = OnRetryHandler(logger, onRetryHandler); + ///// + //ISyncPolicy IPolicyBuilderBuild.BuildPolicy() + //{ + // var waitProvider = sleepDurationProvider; + // var onRetryProvider = OnRetryHandler(logger, onRetryHandler); - return policyBuilder.policyBuilder.policyBuilder.WaitAndRetry(policyBuilder.retryCount, waitProvider, onRetryProvider); - } + // return 
policyBuilder.policyBuilder.policyBuilder.WaitAndRetry(policyBuilder.retryCount, waitProvider, onRetryProvider); + //} /// - IAsyncPolicy IPolicyBuilderBuild.BuildAsync() + IAsyncPolicy IPolicyBuilderBuild.BuildAsyncPolicy() { var waitProvider = sleepDurationProvider; var onRetryProvider = OnRetryHandlerAsync(logger, onRetryHandler, onRetryHandlerAsync); return policyBuilder.policyBuilder.policyBuilder.WaitAndRetryAsync(policyBuilder.retryCount, waitProvider, onRetryProvider); } + + ///// + //RetryHandlerPolicy IPolicyBuilderBuild.Build() + // => new(((IPolicyBuilderBuild)this).BuildPolicy()); + + /// + AsyncRetryHandlerPolicy IPolicyBuilderBuild.BuildAsync() + => new(((IPolicyBuilderBuild)this).BuildAsyncPolicy()); } - #endregion } + #endregion + + public static Context PrepareContext(string caller) => new() + { + [CallerMemberNameKey] = caller + }; } diff --git a/src/Tes.Runner/Docker/DockerExecutor.cs b/src/Tes.Runner/Docker/DockerExecutor.cs index 4ccc126bd..00f14e654 100644 --- a/src/Tes.Runner/Docker/DockerExecutor.cs +++ b/src/Tes.Runner/Docker/DockerExecutor.cs @@ -17,7 +17,7 @@ public class DockerExecutor private readonly IDockerClient dockerClient = null!; private readonly ILogger logger = PipelineLoggerFactory.Create(); private readonly NetworkUtility networkUtility = new(); - private readonly Polly.IAsyncPolicy asyncRetryPolicy = + private readonly RetryHandler.AsyncRetryHandlerPolicy asyncRetryPolicy = new RetryHandler(Options.Create(new RetryPolicyOptions())) .RetryDefaultPolicyBuilder().SetOnRetryBehavior().BuildAsync(); private readonly IStreamLogReader streamLogReader = null!; @@ -129,13 +129,13 @@ private async Task PullImageWithRetriesAsync(string imageName, string? 
tag, Auth { logger.LogInformation($"Pulling image name: {imageName} image tag: {tag}"); - await asyncRetryPolicy.ExecuteAsync(async () => + await asyncRetryPolicy.ExecuteWithRetryAsync(async _ => { await dockerClient.Images.CreateImageAsync( new ImagesCreateParameters() { FromImage = imageName, Tag = tag }, authConfig, new Progress(message => logger.LogDebug(message.Status))); - }); + }, CancellationToken.None); } private async Task DeleteAllImagesAsync() diff --git a/src/TesApi.Tests/ContainerRegistryProviderTests.cs b/src/TesApi.Tests/ContainerRegistryProviderTests.cs index 82e84e5a3..c59c241f2 100644 --- a/src/TesApi.Tests/ContainerRegistryProviderTests.cs +++ b/src/TesApi.Tests/ContainerRegistryProviderTests.cs @@ -22,7 +22,7 @@ public class ContainerRegistryProviderTests { private ContainerRegistryProvider containerRegistryProvider; private ContainerRegistryOptions containerRegistryOptions; - private Lazy> asyncRetryPolicy = null!; + private Lazy> asyncRetryPolicy = null!; private Mock appCacheMock; private Mock> containerRegistryOptionsMock; private Mock> loggerMock; @@ -52,7 +52,7 @@ public async Task GetContainerRegistryInfoAsync_ServerIsAccessible_ReturnsAndAdd var server = "registry.com"; var image = $"{server}/image"; asyncRetryPolicy.Value.Setup(r => - r.ExecuteAsync(It.IsAny>>>(), It.IsAny(), It.IsAny())) + r.ExecuteWithRetryAsync(It.IsAny>>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(new List() { new() { RegistryServer = server } @@ -82,7 +82,7 @@ public async Task GetContainerRegistryInfoAsync_ServerInCache_ReturnsRegistryInf Assert.AreEqual(server, container.RegistryServer); appCacheMock.Verify(c => c.TryGetValue(It.Is(v => $"{nameof(ContainerRegistryProvider)}:{image}".Equals(v)), out It.Ref.IsAny), Times.Once()); asyncRetryPolicy.Value.Verify(r => - r.ExecuteAsync(It.IsAny>>>(), It.IsAny(), It.IsAny()), Times.Never); + r.ExecuteWithRetryAsync(It.IsAny>>>(), It.IsAny(), It.IsAny()), Times.Never); } [TestMethod] @@ -102,9 +102,9 @@ public async Task 
GetContainerRegistryInfoAsync_NoAccessibleServerNoServerCached var server = "registry"; var image = $"{server}_other/image"; asyncRetryPolicy.Value.Setup(r => - r.ExecuteAsync( - It.IsAny>>>(), - It.IsAny(), It.IsAny())) + r.ExecuteWithRetryAsync( + It.IsAny>>>(), + It.IsAny(), It.IsAny())) .ReturnsAsync(new List() { new() { RegistryServer = server } diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index b3e500d34..aae9401f7 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -47,8 +47,8 @@ namespace TesApi.Web public partial class AzureProxy : IAzureProxy { private const char BatchJobAttemptSeparator = '-'; - private readonly IAsyncPolicy batchRetryPolicyWhenJobNotFound; - private readonly IAsyncPolicy batchRetryPolicyWhenNodeNotReady; + private readonly RetryHandler.AsyncRetryHandlerPolicy batchRetryPolicyWhenJobNotFound; + private readonly RetryHandler.AsyncRetryHandlerPolicy batchRetryPolicyWhenNodeNotReady; private readonly ILogger logger; private readonly BatchClient batchClient; @@ -104,7 +104,6 @@ public AzureProxy(IOptions batchAccountOptions, IBatchPoolM location = batchAccountOptions.Value.Region; //subscriptionId = batchAccountOptions.Value.SubscriptionId; //batchResourceGroupName = batchAccountOptions.Value.ResourceGroup; - } else { @@ -272,7 +271,7 @@ public async Task CreateAutoPoolModeBatchJobAsync(string jobId, CloudTask cloudT try { logger.LogInformation($"TES task: {cloudTask.Id} adding task to job."); - job = await batchRetryPolicyWhenJobNotFound.ExecuteAsync(ct => + job = await batchRetryPolicyWhenJobNotFound.ExecuteWithRetryAsync(ct => batchClient.JobOperations.GetJobAsync(job.Id, cancellationToken: ct), cancellationToken); @@ -318,7 +317,7 @@ public async Task AddBatchTaskAsync(string tesTaskId, CloudTask cloudTask, PoolI ArgumentException.ThrowIfNullOrEmpty(poolInformation?.PoolId, nameof(poolInformation)); logger.LogInformation("TES task: {TesTask} - Adding task to job {BatchJob}", 
tesTaskId, poolInformation.PoolId); - var job = await batchRetryPolicyWhenJobNotFound.ExecuteAsync(ct => + var job = await batchRetryPolicyWhenJobNotFound.ExecuteWithRetryAsync(ct => batchClient.JobOperations.GetJobAsync(poolInformation.PoolId, cancellationToken: ct), cancellationToken); @@ -564,7 +563,7 @@ public async Task DeleteBatchTaskAsync(string tesTaskId, PoolInformation pool, C foreach (var task in batchTasksToDelete) { logger.LogInformation("Deleting task {BatchTask}", task.Id); - await batchRetryPolicyWhenNodeNotReady.ExecuteAsync(ct => task.DeleteAsync(cancellationToken: ct), cancellationToken); + await batchRetryPolicyWhenNodeNotReady.ExecuteWithRetryAsync(ct => task.DeleteAsync(cancellationToken: ct), cancellationToken); } } diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index beca43672..184cb311c 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -25,10 +25,10 @@ public class CachingWithRetriesAzureProxy : IAzureProxy { private readonly ILogger logger; private readonly IAzureProxy azureProxy; - private readonly CachingRetryHandler.ICachingSyncPolicy cachingRetry; - private readonly CachingRetryHandler.ICachingAsyncPolicy cachingAsyncRetry; - private readonly CachingRetryHandler.ICachingAsyncPolicy cachingAsyncRetryExceptWhenExists; - private readonly CachingRetryHandler.ICachingAsyncPolicy cachingAsyncRetryExceptWhenNotFound; + private readonly CachingRetryHandler.CachingRetryHandlerPolicy cachingRetry; + private readonly CachingRetryHandler.CachingAsyncRetryHandlerPolicy cachingAsyncRetry; + private readonly CachingRetryHandler.CachingAsyncRetryHandlerPolicy cachingAsyncRetryExceptWhenExists; + private readonly CachingRetryHandler.CachingAsyncRetryHandlerPolicy cachingAsyncRetryExceptWhenNotFound; /// /// Contructor to create a cache of @@ -345,13 +345,13 @@ public Task GetFullAllocationStateAsync(string poo /// public 
IAsyncEnumerable ListComputeNodesAsync(string poolId, DetailLevel detailLevel) { - return cachingAsyncRetry.ExecuteAsync(() => azureProxy.ListComputeNodesAsync(poolId, detailLevel), cachingRetry); + return cachingAsyncRetry.ExecuteWithRetryAsync(() => azureProxy.ListComputeNodesAsync(poolId, detailLevel), cachingRetry); } /// public IAsyncEnumerable ListTasksAsync(string jobId, DetailLevel detailLevel) { - return cachingAsyncRetry.ExecuteAsync(() => azureProxy.ListTasksAsync(jobId, detailLevel), cachingRetry); + return cachingAsyncRetry.ExecuteWithRetryAsync(() => azureProxy.ListTasksAsync(jobId, detailLevel), cachingRetry); } /// diff --git a/src/TesApi.Web/Extensions/PagedInterfaceExtensions.cs b/src/TesApi.Web/Extensions/PagedInterfaceExtensions.cs index 0847da7b9..143780d0b 100644 --- a/src/TesApi.Web/Extensions/PagedInterfaceExtensions.cs +++ b/src/TesApi.Web/Extensions/PagedInterfaceExtensions.cs @@ -60,14 +60,14 @@ public static IAsyncEnumerable ToAsyncEnumerable(this IPage source, Fun /// Policy retrying call to . /// Name of method originating the retriable operation. 
/// - public static IAsyncEnumerable ExecuteAsync(this Polly.IAsyncPolicy asyncRetryPolicy, Func> func, Polly.ISyncPolicy retryPolicy, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + public static IAsyncEnumerable ExecuteWithRetryAsync(this RetryHandler.AsyncRetryHandlerPolicy asyncRetryPolicy, Func> func, RetryHandler.RetryHandlerPolicy retryPolicy, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) // ExecuteWithRetryAsync { ArgumentNullException.ThrowIfNull(asyncRetryPolicy); ArgumentNullException.ThrowIfNull(func); ArgumentNullException.ThrowIfNull(retryPolicy); - var ctx = new Polly.Context { [RetryHandler.CallerMemberNameKey] = caller }; - return new PollyAsyncEnumerable(retryPolicy.Execute(_ => func(), ctx), asyncRetryPolicy, ctx); + var ctx = RetryHandler.PrepareContext(caller); + return new PollyAsyncEnumerable(retryPolicy.RetryPolicy.Execute(_ => func(), ctx), asyncRetryPolicy, ctx); } #region Implementation classes @@ -103,10 +103,10 @@ public IAsyncEnumerator GetAsyncEnumerator(CancellationToken cancellationToke private sealed class PollyAsyncEnumerable : IAsyncEnumerable { private readonly IAsyncEnumerable _source; - private readonly Polly.IAsyncPolicy _retryPolicy; + private readonly RetryHandler.AsyncRetryHandlerPolicy _retryPolicy; private readonly Polly.Context _ctx; - public PollyAsyncEnumerable(IAsyncEnumerable source, Polly.IAsyncPolicy retryPolicy, Polly.Context ctx) + public PollyAsyncEnumerable(IAsyncEnumerable source, RetryHandler.AsyncRetryHandlerPolicy retryPolicy, Polly.Context ctx) { ArgumentNullException.ThrowIfNull(source); ArgumentNullException.ThrowIfNull(retryPolicy); @@ -124,11 +124,11 @@ IAsyncEnumerator IAsyncEnumerable.GetAsyncEnumerator(CancellationToken can private sealed class PollyAsyncEnumerator : IAsyncEnumerator { private readonly IAsyncEnumerator _source; - private readonly Polly.IAsyncPolicy _retryPolicy; + private readonly RetryHandler.AsyncRetryHandlerPolicy 
_retryPolicy; private readonly CancellationToken _cancellationToken; private readonly Polly.Context _ctx; - public PollyAsyncEnumerator(IAsyncEnumerator source, Polly.IAsyncPolicy retryPolicy, Polly.Context ctx, CancellationToken cancellationToken) + public PollyAsyncEnumerator(IAsyncEnumerator source, RetryHandler.AsyncRetryHandlerPolicy retryPolicy, Polly.Context ctx, CancellationToken cancellationToken) { ArgumentNullException.ThrowIfNull(source); ArgumentNullException.ThrowIfNull(retryPolicy); @@ -147,7 +147,7 @@ ValueTask IAsyncDisposable.DisposeAsync() => _source.DisposeAsync(); ValueTask IAsyncEnumerator.MoveNextAsync() - => new(_retryPolicy.ExecuteAsync((_, ct) => _source.MoveNextAsync(ct).AsTask(), new(_ctx.OperationKey, _ctx), _cancellationToken)); + => new(_retryPolicy.RetryPolicy.ExecuteAsync((_, ct) => _source.MoveNextAsync(ct).AsTask(), new(_ctx.OperationKey, _ctx), _cancellationToken)); } private sealed class PageEnumerator : EnumeratorEnumerator> diff --git a/src/TesApi.Web/Management/AzureProvider.cs b/src/TesApi.Web/Management/AzureProvider.cs index 536d372a5..7e9632ec5 100644 --- a/src/TesApi.Web/Management/AzureProvider.cs +++ b/src/TesApi.Web/Management/AzureProvider.cs @@ -12,7 +12,7 @@ namespace TesApi.Web.Management /// public abstract class AzureProvider { - private protected readonly CachingRetryHandler.ICachingAsyncPolicy CachingAsyncRetryPolicy; + private protected readonly CachingRetryHandler.CachingAsyncRetryHandlerPolicy CachingAsyncRetryPolicy; private protected readonly AzureManagementClientsFactory ManagementClientsFactory; private protected readonly ILogger Logger; From cd0b275e4f894ca7f45ceffd5359c72c544e6bb2 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 30 Nov 2023 18:47:44 -0800 Subject: [PATCH 103/202] format and refactor for clarity --- .../CacheAndRetryHandlerTest.cs | 11 +- .../TerraWsmApiClientTests.cs | 6 +- .../TestServices/RetryHandlersHelpers.cs | 6 +- src/Tes.ApiClients/CachingRetryHandler.cs | 24 +- 
src/Tes.ApiClients/HttpApiClient.cs | 4 +- src/Tes.ApiClients/RetryHandler.cs | 234 +++++++++++------- src/Tes.ApiClients/Tes.ApiClients.csproj | 4 - src/Tes.Runner/Docker/DockerExecutor.cs | 13 +- src/TesApi.Web/AzureProxy.cs | 4 +- .../CachingWithRetriesAzureProxy.cs | 2 +- src/TesApi.Web/Management/AzureProvider.cs | 2 +- 11 files changed, 185 insertions(+), 125 deletions(-) diff --git a/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs b/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs index a5f6a61f1..7a0a2697f 100644 --- a/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs +++ b/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs @@ -28,13 +28,13 @@ public void SetUp() var cachingRetryHandler = new CachingRetryHandler(appCache, mockOptions.Object); cachingAsyncHttpResponseMessagePolicy = cachingRetryHandler - .RetryDefaultHttpResponseMessagePolicyBuilder() + .DefaultRetryHttpResponseMessagePolicyBuilder() .SetOnRetryBehavior() //.AddCaching() - .BuildAsync(); + .AsyncBuild(); cachingAsyncPolicy = cachingRetryHandler - .RetryDefaultPolicyBuilder() + .DefaultRetryPolicyBuilder() .SetOnRetryBehavior() .AddCaching() .BuildAsync(); @@ -105,9 +105,8 @@ public async Task ExecuteHttpRequestWithRetryAsync_RetriesOnlyOnExpectedFailureC mockFactory.Setup(f => f.CreateResponseAsync()).Returns(CreateResponseAsync(statusCode)); var response = - await cachingAsyncHttpResponseMessagePolicy.ExecuteWithRetryAsync(_ => - mockFactory.Object.CreateResponseAsync(), - CancellationToken.None); + await cachingAsyncHttpResponseMessagePolicy.ExecuteWithRetryAsync( + mockFactory.Object.CreateResponseAsync); mockFactory.Verify(f => f.CreateResponseAsync(), Times.Exactly(numberOfTimes)); Assert.AreEqual(response.StatusCode, statusCode); diff --git a/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs b/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs index ba16f045f..880dbdd50 100644 --- a/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs +++ 
b/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs @@ -29,7 +29,7 @@ public void SetUp() var cache = new Mock(); cache.Setup(c => c.CreateEntry(It.IsAny())).Returns(new Mock().Object); cacheAndRetryHandler.SetupGet(c => c.AppCache).Returns(cache.Object); - asyncResponseRetryPolicy = new(TestServices.RetryHandlersHelpers.GetCachingAsyncRetryPolicyMock(cacheAndRetryHandler, c => c.RetryDefaultHttpResponseMessagePolicyBuilder())); + asyncResponseRetryPolicy = new(TestServices.RetryHandlersHelpers.GetCachingAsyncRetryPolicyMock(cacheAndRetryHandler, c => c.DefaultRetryHttpResponseMessagePolicyBuilder())); terraWsmApiClient = new TerraWsmApiClient(TerraApiStubData.WsmApiHost, tokenCredential.Object, cacheAndRetryHandler.Object, NullLogger.Instance); } @@ -155,7 +155,7 @@ public async Task GetResourceQuotaAsync_ValidResourceIdReturnsQuotaInformationAn { var body = terraApiStubData.GetResourceQuotaApiResponseInJson(); - asyncResponseRetryPolicy.Value.Setup(c => c.ExecuteWithRetryAndCachingAsync(It.IsAny(), + asyncResponseRetryPolicy.Value.Setup(c => c.ExecuteWithRetryConversionAndCachingAsync(It.IsAny(), It.IsAny>>(), It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(body); @@ -182,7 +182,7 @@ public async Task GetLandingZoneResourcesAsync_ListOfLandingZoneResourcesAndGets { var body = terraApiStubData.GetResourceApiResponseInJson(); - asyncResponseRetryPolicy.Value.Setup(c => c.ExecuteWithRetryAndCachingAsync(It.IsAny(), + asyncResponseRetryPolicy.Value.Setup(c => c.ExecuteWithRetryConversionAndCachingAsync(It.IsAny(), It.IsAny>>(), It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(body); diff --git a/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers.cs b/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers.cs index e24b25726..7b72f250a 100644 --- a/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers.cs +++ b/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers.cs @@ -28,13 +28,13 @@ internal static Mock GetCachingAsyncRetryPolicyM var 
builderBuild = new Mock(); builderBuild.Setup(policy => policy.PolicyBuilderBase) .Returns(cachingRetryHandler.Object); - builderBuild.Setup(c => c.BuildAsyncPolicy()) + builderBuild.Setup(c => c.AsyncBuildPolicy()) .Returns((IAsyncPolicy)cachingAsyncRetryPolicy.Object); var builderWait = new Mock(); builderWait.Setup(c => c.SetOnRetryBehavior(It.IsAny(), It.IsAny(), It.IsAny())) .Returns(builderBuild.Object); - cachingRetryHandler.Setup(c => c.RetryDefaultPolicyBuilder()) + cachingRetryHandler.Setup(c => c.DefaultRetryPolicyBuilder()) .Returns(builderWait.Object); cachingAsyncRetryPolicy.Setup(c => c.AppCache) @@ -56,7 +56,7 @@ internal static Mock> GetCachingAsyncRet var builderBuild = new Mock>(); builderBuild.Setup(policy => policy.PolicyBuilderBase) .Returns(cachingRetryHandler.Object); - builderBuild.Setup(c => c.BuildAsyncPolicy()) + builderBuild.Setup(c => c.AsyncBuildPolicy()) .Returns((IAsyncPolicy)cachingAsyncRetryPolicy.Object); var builderWait = new Mock>(); builderWait.Setup(c => c.SetOnRetryBehavior(It.IsAny(), It.IsAny>(), It.IsAny>())) diff --git a/src/Tes.ApiClients/CachingRetryHandler.cs b/src/Tes.ApiClients/CachingRetryHandler.cs index 62512d65d..8090ddcc0 100644 --- a/src/Tes.ApiClients/CachingRetryHandler.cs +++ b/src/Tes.ApiClients/CachingRetryHandler.cs @@ -101,7 +101,7 @@ public virtual async Task ExecuteWithRetryAndCachingAsync(stri } /// - /// Executes a delegate with the specified async retry policy and persisting the result in a cache. + /// Executes a delegate with the specified async retry policy and persisting the result in a cache. /// /// /// Action to execute @@ -168,7 +168,7 @@ public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheK } /// - /// Executes a delegate with the specified async retry policy and persisting the result in a cache. + /// Executes a delegate with the specified async retry policy and persisting the result in a cache. 
/// /// /// Action to execute @@ -185,24 +185,24 @@ public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheK /// /// Executes a delegate with the specified async retry policy and persisting the result in a cache. /// - /// Final return type + /// Instance type in cache /// /// Action to execute /// Method to convert /// A for controlling the lifetime of the asynchronous operation. /// Name of method originating the retriable operation. /// - public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, Func> convert, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + public virtual async Task ExecuteWithRetryConversionAndCachingAsync(string cacheKey, Func> action, Func> convert, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) { ValidateArgs(cacheKey, action); - return await ExecuteWithCacheAsync(appCache, cacheKey, async () => await convert(await ExecuteWithRetryAsync(action, cancellationToken, caller), cancellationToken)); + return await ExecuteWithCacheAsync(appCache, cacheKey, () => ExecuteWithRetryAndConversionAsync(action, convert, cancellationToken, caller)); } /// - /// Executes a delegate with the specified async retry policy and persisting the result in a cache. + /// Executes a delegate with the specified async retry policy and persisting the result in a cache. /// - /// Final return type + /// Instance type in cache /// /// Action to execute /// Method to convert @@ -210,11 +210,11 @@ public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, /// A for controlling the lifetime of the asynchronous operation. /// Name of method originating the retriable operation. 
/// - public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, Func> convert, DateTimeOffset cachesExpires, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + public virtual async Task ExecuteWithRetryConversionAndCachingAsync(string cacheKey, Func> action, Func> convert, DateTimeOffset cachesExpires, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) { ValidateArgs(cacheKey, action); - return await ExecuteWithCacheAsync(appCache, cacheKey, async () => await convert(await ExecuteWithRetryAsync(action, cancellationToken, caller), cancellationToken), cachesExpires); + return await ExecuteWithCacheAsync(appCache, cacheKey, () => ExecuteWithRetryAndConversionAsync(action, convert, cancellationToken, caller), cachesExpires); } } #endregion @@ -286,12 +286,12 @@ public CachingPolicyBuilderBuild(IPolicyBuilderBuild policyBuilder, CachingRetry CachingRetryHandlerPolicy ICachingPolicyBuilderBuild.Build() { - return new(policyBuilder.BuildPolicy(), cachingHandler.AppCache); + return new(policyBuilder.SyncBuildPolicy(), cachingHandler.AppCache); } CachingAsyncRetryHandlerPolicy ICachingPolicyBuilderBuild.BuildAsync() { - return new(policyBuilder.BuildAsyncPolicy(), cachingHandler.AppCache); + return new(policyBuilder.AsyncBuildPolicy(), cachingHandler.AppCache); } } @@ -314,7 +314,7 @@ public CachingPolicyBuilderBuild(IPolicyBuilderBuild policyBuilder, Cac CachingAsyncRetryHandlerPolicy ICachingPolicyBuilderBuild.BuildAsync() { - return new(policyBuilder.BuildAsyncPolicy(), cachingHandler.AppCache); + return new(policyBuilder.AsyncBuildPolicy(), cachingHandler.AppCache); } } #endregion diff --git a/src/Tes.ApiClients/HttpApiClient.cs b/src/Tes.ApiClients/HttpApiClient.cs index c320657b7..680f50f7d 100644 --- a/src/Tes.ApiClients/HttpApiClient.cs +++ b/src/Tes.ApiClients/HttpApiClient.cs @@ -46,7 +46,7 @@ protected 
HttpApiClient(CachingRetryHandler cachingRetryHandler, ILogger logger) this.Logger = logger; AsyncCachingHttpResponseMessageRetryPolicy = cachingRetryHandler - .RetryDefaultHttpResponseMessagePolicyBuilder() + .DefaultRetryHttpResponseMessagePolicyBuilder() .SetOnRetryBehavior(onRetry: LogRetryErrorOnRetryHttpResponseMessageHandler()) .AddCaching() .BuildAsync(); @@ -163,7 +163,7 @@ protected async Task HttpGetRequestWithCachingAndRetryPolicyAsync(Uri re { var cacheKey = await ToCacheKeyAsync(requestUrl, setAuthorizationHeader, cancellationToken); - return (await AsyncCachingHttpResponseMessageRetryPolicy.ExecuteWithRetryAndCachingAsync(cacheKey, + return (await AsyncCachingHttpResponseMessageRetryPolicy.ExecuteWithRetryConversionAndCachingAsync(cacheKey, async ct => { //request must be recreated in every retry. diff --git a/src/Tes.ApiClients/RetryHandler.cs b/src/Tes.ApiClients/RetryHandler.cs index 78ce1fa30..39807d3f9 100644 --- a/src/Tes.ApiClients/RetryHandler.cs +++ b/src/Tes.ApiClients/RetryHandler.cs @@ -15,8 +15,13 @@ namespace Tes.ApiClients; /// public class RetryHandler { - // Keys used by this implementation in Polly Contexts + /// + /// Polly Context key for caller method name + /// public const string CallerMemberNameKey = $"Tes.ApiClients.{nameof(RetryHandler)}.CallerMemberName"; + /// + /// Polly Context key for backup skip increment setting + /// public const string BackupSkipProvidedIncrementKey = $"Tes.ApiClients.{nameof(RetryHandler)}.BackupSkipProvidedIncrementCount"; /// @@ -31,14 +36,14 @@ public class RetryHandler Polly.Extensions.Http.HttpPolicyExtensions.HandleTransientHttpError() .OrResult(r => r.StatusCode == HttpStatusCode.TooManyRequests); - /// Shortcut starting point for testing. - public virtual IPolicyBuilderWait RetryDefaultPolicyBuilder() + /// Shortcut starting point for testing. Can be used in production as well. 
+ public virtual IPolicyBuilderWait DefaultRetryPolicyBuilder() => PolicyBuilder .OpinionatedRetryPolicy() .WithRetryPolicyOptionsWait(); - /// Shortcut starting point for testing. - public virtual IPolicyBuilderWait RetryDefaultHttpResponseMessagePolicyBuilder() + /// Shortcut starting point for testing. Can be used in production as well. + public virtual IPolicyBuilderWait DefaultRetryHttpResponseMessagePolicyBuilder() => PolicyBuilder .OpinionatedRetryPolicy(DefaultHttpResponseMessagePolicyBuilder) .WithRetryPolicyOptionsWait(); @@ -66,6 +71,7 @@ public class RetryHandlerPolicy { private readonly ISyncPolicy retryPolicy; + /// For extensions public ISyncPolicy RetryPolicy => retryPolicy; /// @@ -82,6 +88,18 @@ public RetryHandlerPolicy(ISyncPolicy retryPolicy) public RetryHandlerPolicy() { } + /// + /// Executes a delegate with the configured policy. + /// + /// Action to execute. + /// Name of method originating the retriable operation. + public virtual void ExecuteWithRetry(Action action, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + { + ArgumentNullException.ThrowIfNull(action); + + retryPolicy.Execute(_ => action(), PrepareContext(caller)); + } + /// /// Executes a delegate with the configured policy. /// @@ -93,44 +111,15 @@ public virtual TResult ExecuteWithRetry(Func action, [System.R { ArgumentNullException.ThrowIfNull(action); - return retryPolicy.Execute(_ => action(), RetryHandler.PrepareContext(caller)); + return retryPolicy.Execute(_ => action(), PrepareContext(caller)); } } + // TODO: if/when needed ///// ///// Generic synchronous retry policy ///// //public class RetryHandlerPolicy - //{ - // private readonly ISyncPolicy retryPolicy; - - // /// - // /// Public constructor - // /// - // /// Synchronous retry policy. 
- // public RetryHandlerPolicy(ISyncPolicy retryPolicy) - // { - // ArgumentNullException.ThrowIfNull(retryPolicy); - // this.retryPolicy = retryPolicy; - // } - - // /// For mocking - // public RetryHandlerPolicy() { } - - - // /// - // /// Executes a delegate with the configured policy. - // /// - // /// Action to execute. - // /// Name of method originating the retriable operation. - // /// - // public virtual TResult ExecuteWithRetry(Func action, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) - // { - // ArgumentNullException.ThrowIfNull(action); - - // return retryPolicy.Execute(_ => action(), PrepareContext(caller)); - // } - //} /// /// Non-generic asynchronous retry policy @@ -139,6 +128,7 @@ public class AsyncRetryHandlerPolicy { private readonly IAsyncPolicy retryPolicy; + /// For extensions public IAsyncPolicy RetryPolicy => retryPolicy; /// @@ -158,16 +148,14 @@ public AsyncRetryHandlerPolicy() { } /// /// Executes a delegate with the configured async policy. /// - /// Result type. /// Action to execute. - /// A for controlling the lifetime of the asynchronous operation. /// Name of method originating the retriable operation. - /// instance. 
- public virtual Task ExecuteWithRetryAsync(Func> action, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + /// + public virtual Task ExecuteWithRetryAsync(Func action, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) { ArgumentNullException.ThrowIfNull(action); - return retryPolicy.ExecuteAsync((_, ct) => action(ct), PrepareContext(caller), cancellationToken); + return retryPolicy.ExecuteAsync((_, _) => action(), PrepareContext(caller), CancellationToken.None); } /// @@ -183,6 +171,35 @@ public virtual Task ExecuteWithRetryAsync(Func action, return retryPolicy.ExecuteAsync((_, ct) => action(ct), PrepareContext(caller), cancellationToken); } + + /// + /// Executes a delegate with the configured async policy. + /// + /// Result type. + /// Action to execute. + /// Name of method originating the retriable operation. + /// instance. + public virtual Task ExecuteWithRetryAsync(Func> action, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + { + ArgumentNullException.ThrowIfNull(action); + + return retryPolicy.ExecuteAsync((_, _) => action(), PrepareContext(caller), CancellationToken.None); + } + + /// + /// Executes a delegate with the configured async policy. + /// + /// Result type. + /// Action to execute. + /// A for controlling the lifetime of the asynchronous operation. + /// Name of method originating the retriable operation. + /// instance. 
+ public virtual Task ExecuteWithRetryAsync(Func> action, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + { + ArgumentNullException.ThrowIfNull(action); + + return retryPolicy.ExecuteAsync((_, ct) => action(ct), PrepareContext(caller), cancellationToken); + } } /// @@ -192,6 +209,9 @@ public class AsyncRetryHandlerPolicy { private readonly IAsyncPolicy retryPolicy; + /// For extensions + public IAsyncPolicy RetryPolicy => retryPolicy; + /// /// Public constructor /// @@ -206,6 +226,19 @@ public AsyncRetryHandlerPolicy(IAsyncPolicy retryPolicy) public AsyncRetryHandlerPolicy() { } + /// + /// Executes a delegate with the configured async policy. + /// + /// Action to execute + /// Name of method originating the retriable operation. + /// + public virtual Task ExecuteWithRetryAsync(Func> action, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + { + ArgumentNullException.ThrowIfNull(action); + + return retryPolicy.ExecuteAsync((_, _) => action(), PrepareContext(caller), CancellationToken.None); + } + /// /// Executes a delegate with the configured async policy. /// @@ -219,10 +252,29 @@ public virtual Task ExecuteWithRetryAsync(Func action(ct), PrepareContext(caller), cancellationToken); } + + /// + /// Executes a delegate with the configured async policy. + /// + /// Return type + /// Action to execute + /// Method to convert + /// A for controlling the lifetime of the asynchronous operation. + /// Name of method originating the retriable operation. 
+ /// + public virtual async Task ExecuteWithRetryAndConversionAsync(Func> action, Func> convert, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + { + ArgumentNullException.ThrowIfNull(action); + + return await convert(await retryPolicy.ExecuteAsync((_, ct) => action(ct), PrepareContext(caller), cancellationToken), cancellationToken); + } } #endregion #region Builder interfaces + /// + /// Selects retry policy type. + /// public interface IPolicyBuilderPolicy { /// @@ -247,6 +299,9 @@ public interface IPolicyBuilderPolicy IPolicyBuilderBase OpinionatedRetryPolicy(PolicyBuilder policyBuilder); } + /// + /// Selects retry policy wait algorithm. + /// public interface IPolicyBuilderBase { /// @@ -290,6 +345,9 @@ public interface IPolicyBuilderBase IPolicyBuilderWait WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func waitDurationProvider, bool backupSkipProvidedIncrements); } + /// + /// Selects retry policy wait algorithm. + /// public interface IPolicyBuilderBase { /// @@ -304,7 +362,7 @@ public interface IPolicyBuilderBase /// Maximum number of retries. /// Value in seconds which is raised by the power of the retry attempt. /// OnRetry hander - IPolicyBuilderWait WithExponentialBackoffWait(int retryCount, double exponentialBackOffExponent); + IPolicyBuilderWait WithExponentialBackoffWait(int maxRetryCount, double exponentialBackOffExponent); /// /// Custom result-based policy. @@ -342,6 +400,9 @@ public interface IPolicyBuilderBase IPolicyBuilderWait WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func waitDurationProvider, bool backupSkipProvidedIncrements); } + /// + /// Sets on-retry handlers. + /// public interface IPolicyBuilderWait { /// @@ -354,6 +415,9 @@ public interface IPolicyBuilderWait IPolicyBuilderBuild SetOnRetryBehavior(ILogger logger = default, OnRetryHandler onRetry = default, OnRetryHandlerAsync onRetryAsync = default); } + /// + /// Sets on-retry handlers. 
+ /// public interface IPolicyBuilderWait { /// @@ -372,25 +436,25 @@ public interface IPolicyBuilderBuild /// Builds . /// /// Retry policy. - RetryHandlerPolicy Build(); + RetryHandlerPolicy SyncBuild(); /// /// Builds for extensions to the builder. /// /// Retry policy. - ISyncPolicy BuildPolicy(); + ISyncPolicy SyncBuildPolicy(); /// /// Builds . /// /// Async retry policy. - AsyncRetryHandlerPolicy BuildAsync(); + AsyncRetryHandlerPolicy AsyncBuild(); /// /// Builds for extensions to the builder. /// /// Async retry policy. - IAsyncPolicy BuildAsyncPolicy(); + IAsyncPolicy AsyncBuildPolicy(); /// /// Retrives the instance of the retryhandler to accomodate extensions to the builder @@ -404,25 +468,25 @@ public interface IPolicyBuilderBuild ///// Builds . ///// ///// Retry policy. - //RetryHandlerPolicy Build(); + //RetryHandlerPolicy SyncBuild(); ///// ///// Builds for extensions to the builder. ///// ///// Retry policy. - //ISyncPolicy BuildPolicy(); + //ISyncPolicy SyncBuildPolicy(); /// /// Builds . /// /// Async retry policy. - AsyncRetryHandlerPolicy BuildAsync(); + AsyncRetryHandlerPolicy AsyncBuild(); /// /// Builds for extensions to the builder. /// /// Async retry policy. 
- IAsyncPolicy BuildAsyncPolicy(); + IAsyncPolicy AsyncBuildPolicy(); /// /// Retrives the instance of the retryhandler to accomodate extensions to the builder @@ -487,7 +551,7 @@ private readonly struct Defaults public readonly RetryPolicyOptions PolicyOptions; public readonly RetryHandler PolicyBuilderBase; - internal Defaults(RetryHandler retryHandler, RetryPolicyOptions options) + public Defaults(RetryHandler retryHandler, RetryPolicyOptions options) { ArgumentNullException.ThrowIfNull(options); PolicyOptions = options; @@ -499,7 +563,7 @@ internal Defaults(RetryHandler retryHandler, RetryPolicyOptions options) { private readonly Defaults Defaults; - internal PolicyBuilderPolicy(Defaults options) + public PolicyBuilderPolicy(Defaults options) { ArgumentNullException.ThrowIfNull(options); Defaults = options; @@ -522,7 +586,7 @@ IPolicyBuilderBase IPolicyBuilderPolicy.OpinionatedRetryPolicy public readonly PolicyBuilder policyBuilder; public readonly Defaults Defaults; - internal PolicyBuilderBase(PolicyBuilder policyBuilder, Defaults defaults) + public PolicyBuilderBase(PolicyBuilder policyBuilder, Defaults defaults) { ArgumentNullException.ThrowIfNull(policyBuilder); ArgumentNullException.ThrowIfNull(defaults); @@ -545,7 +609,7 @@ public static Func ExceptionBasedSleepDuratio TimeSpan AdjustAttemptIfNeeded() { - if (!ctx.TryGetValue(BackupSkipProvidedIncrementKey, out var value) || value is not int) + if (!ctx.TryGetValue(BackupSkipProvidedIncrementKey, out var value) || value is not int || attempt < 2) { ctx[BackupSkipProvidedIncrementKey] = value = 0; } @@ -591,7 +655,7 @@ IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithExponentialBacko public readonly PolicyBuilder policyBuilder; public readonly Defaults Defaults; - internal PolicyBuilderBase(PolicyBuilder policyBuilder, Defaults defaults) + public PolicyBuilderBase(PolicyBuilder policyBuilder, Defaults defaults) { ArgumentNullException.ThrowIfNull(policyBuilder); 
ArgumentNullException.ThrowIfNull(defaults); @@ -612,8 +676,8 @@ IPolicyBuilderWait IPolicyBuilderBase.WithCustomizedRetryPolic => new PolicyBuilderWait(this, maxRetryCount, (attempt, outcome, _1) => sleepDurationProvider(attempt, outcome), default); /// - IPolicyBuilderWait IPolicyBuilderBase.WithExponentialBackoffWait(int retryCount, double exponentialBackOffExponent) - => new PolicyBuilderWait(this, retryCount, default, PolicyBuilderBase.ExponentialSleepDurationProvider(exponentialBackOffExponent)); + IPolicyBuilderWait IPolicyBuilderBase.WithExponentialBackoffWait(int maxRetryCount, double exponentialBackOffExponent) + => new PolicyBuilderWait(this, maxRetryCount, default, PolicyBuilderBase.ExponentialSleepDurationProvider(exponentialBackOffExponent)); /// IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func sleepDurationProvider, bool backupSkipProvidedIncrements) @@ -628,13 +692,13 @@ IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWi { public readonly PolicyBuilderBase policyBuilder; public readonly Func sleepDurationProvider; - public readonly int retryCount; + public readonly int maxRetryCount; - internal PolicyBuilderWait(PolicyBuilderBase policyBuilder, int retryCount, Func sleepDurationProvider) + public PolicyBuilderWait(PolicyBuilderBase policyBuilder, int maxRetryCount, Func sleepDurationProvider) { ArgumentNullException.ThrowIfNull(sleepDurationProvider); this.policyBuilder = policyBuilder; - this.retryCount = retryCount; + this.maxRetryCount = maxRetryCount; this.sleepDurationProvider = sleepDurationProvider; } @@ -648,12 +712,12 @@ IPolicyBuilderBuild IPolicyBuilderWait.SetOnRetryBehavior(ILogger logger, OnRetr public readonly PolicyBuilderBase policyBuilder; public readonly Func sleepDurationProvider; public readonly Func, Context, TimeSpan> genericSleepDurationProvider; - public readonly int retryCount; + public readonly int maxRetryCount; - private static Func, Context, TimeSpan> 
PickSetProvider(Func, Context, TimeSpan> tResultProvider, Func exceptionProvider) - => (attempt, outcome, ctx) => tResultProvider is null ? exceptionProvider(attempt, outcome.Exception, ctx) : tResultProvider(attempt, outcome, ctx); + private static Func, Context, TimeSpan> PickSleepDurationProvider(Func, Context, TimeSpan> tResultProvider, Func exceptionProvider) + => tResultProvider is null ? (attempt, outcome, ctx) => exceptionProvider(attempt, outcome.Exception, ctx) : tResultProvider; - internal PolicyBuilderWait(PolicyBuilderBase policyBuilder, int retryCount, Func, Context, TimeSpan> sleepDurationProviderResult, Func sleepDurationProviderException) + public PolicyBuilderWait(PolicyBuilderBase policyBuilder, int maxRetryCount, Func, Context, TimeSpan> sleepDurationProviderResult, Func sleepDurationProviderException) { if (sleepDurationProviderException is null && sleepDurationProviderResult is null) { @@ -661,14 +725,14 @@ internal PolicyBuilderWait(PolicyBuilderBase policyBuilder, int retryCo } this.policyBuilder = policyBuilder; - this.retryCount = retryCount; + this.maxRetryCount = maxRetryCount; this.sleepDurationProvider = sleepDurationProviderException; this.genericSleepDurationProvider = sleepDurationProviderResult; } /// IPolicyBuilderBuild IPolicyBuilderWait.SetOnRetryBehavior(ILogger logger, OnRetryHandler onRetry, OnRetryHandlerAsync onRetryAsync) - => new PolicyBuilderBuild(this, PickSetProvider(genericSleepDurationProvider, sleepDurationProvider), logger, onRetry, onRetryAsync); + => new PolicyBuilderBuild(this, PickSleepDurationProvider(genericSleepDurationProvider, sleepDurationProvider), logger, onRetry, onRetryAsync); } private readonly struct PolicyBuilderBuild : IPolicyBuilderBuild @@ -682,7 +746,7 @@ IPolicyBuilderBuild IPolicyBuilderWait.SetOnRetryBehavior(ILog /// public RetryHandler PolicyBuilderBase { get; } - internal PolicyBuilderBuild(PolicyBuilderWait policyBuilder, Func sleepDurationProvider, ILogger logger, OnRetryHandler 
onRetry, OnRetryHandlerAsync onRetryAsync) + public PolicyBuilderBuild(PolicyBuilderWait policyBuilder, Func sleepDurationProvider, ILogger logger, OnRetryHandler onRetry, OnRetryHandlerAsync onRetryAsync) { ArgumentNullException.ThrowIfNull(sleepDurationProvider); this.policyBuilder = policyBuilder; @@ -727,30 +791,30 @@ public static Func OnRetryHandlerAsync( } /// - ISyncPolicy IPolicyBuilderBuild.BuildPolicy() + ISyncPolicy IPolicyBuilderBuild.SyncBuildPolicy() { var waitProvider = sleepDurationProvider; var onRetryProvider = OnRetryHandler(logger, onRetryHandler); - return policyBuilder.policyBuilder.policyBuilder.WaitAndRetry(policyBuilder.retryCount, (attempt, ctx) => waitProvider(attempt, default, ctx), onRetryProvider); + return policyBuilder.policyBuilder.policyBuilder.WaitAndRetry(policyBuilder.maxRetryCount, (attempt, ctx) => waitProvider(attempt, default, ctx), onRetryProvider); } /// - IAsyncPolicy IPolicyBuilderBuild.BuildAsyncPolicy() + IAsyncPolicy IPolicyBuilderBuild.AsyncBuildPolicy() { var waitProvider = sleepDurationProvider; var onRetryProvider = OnRetryHandlerAsync(logger, onRetryHandler, onRetryHandlerAsync); - return policyBuilder.policyBuilder.policyBuilder.WaitAndRetryAsync(policyBuilder.retryCount, waitProvider, onRetryProvider); + return policyBuilder.policyBuilder.policyBuilder.WaitAndRetryAsync(policyBuilder.maxRetryCount, waitProvider, onRetryProvider); } /// - RetryHandlerPolicy IPolicyBuilderBuild.Build() - => new(((IPolicyBuilderBuild)this).BuildPolicy()); + RetryHandlerPolicy IPolicyBuilderBuild.SyncBuild() + => new(((IPolicyBuilderBuild)this).SyncBuildPolicy()); /// - AsyncRetryHandlerPolicy IPolicyBuilderBuild.BuildAsync() - => new(((IPolicyBuilderBuild)this).BuildAsyncPolicy()); + AsyncRetryHandlerPolicy IPolicyBuilderBuild.AsyncBuild() + => new(((IPolicyBuilderBuild)this).AsyncBuildPolicy()); } private readonly struct PolicyBuilderBuild : IPolicyBuilderBuild @@ -764,7 +828,7 @@ AsyncRetryHandlerPolicy 
IPolicyBuilderBuild.BuildAsync() /// public RetryHandler PolicyBuilderBase { get; } - internal PolicyBuilderBuild(PolicyBuilderWait policyBuilder, Func, Context, TimeSpan> sleepDurationProvider, ILogger logger, OnRetryHandler onRetry, OnRetryHandlerAsync onRetryAsync) + public PolicyBuilderBuild(PolicyBuilderWait policyBuilder, Func, Context, TimeSpan> sleepDurationProvider, ILogger logger, OnRetryHandler onRetry, OnRetryHandlerAsync onRetryAsync) { ArgumentNullException.ThrowIfNull(sleepDurationProvider); this.policyBuilder = policyBuilder; @@ -803,16 +867,16 @@ private static Action, TimeSpan, int, Context> Logger(IL private static Func, TimeSpan, int, Context, Task> OnRetryHandlerAsync(ILogger logger, OnRetryHandler onRetryHandler, OnRetryHandlerAsync onRetryHandlerAsync) { - var genericHandler = onRetryHandler ?? new((outcome, timeSpan, retryCount, correlationId, caller) => { }); - var genericHandlerAsync = onRetryHandlerAsync ?? new((outcome, timespan, retryCount, correlationId, caller) => + var handler = onRetryHandler ?? new((outcome, timeSpan, retryCount, correlationId, caller) => { }); + var asyncHandler = onRetryHandlerAsync ?? 
new((outcome, timespan, retryCount, correlationId, caller) => { - genericHandler(outcome, timespan, retryCount, correlationId, caller); - return Task.CompletedTask; + handler(outcome, timespan, retryCount, correlationId, caller); + return Task.CompletedTask; }); return async (outcome, timespan, retryCount, ctx) => { - await genericHandlerAsync(outcome, timespan, retryCount, ctx.CorrelationId, ctx[CallerMemberNameKey] as string); + await asyncHandler(outcome, timespan, retryCount, ctx.CorrelationId, ctx[CallerMemberNameKey] as string); Logger(logger)(outcome, timespan, retryCount, ctx); }; } @@ -827,12 +891,12 @@ private static Func, TimeSpan, int, Context, Task> OnRet //} /// - IAsyncPolicy IPolicyBuilderBuild.BuildAsyncPolicy() + IAsyncPolicy IPolicyBuilderBuild.AsyncBuildPolicy() { var waitProvider = sleepDurationProvider; var onRetryProvider = OnRetryHandlerAsync(logger, onRetryHandler, onRetryHandlerAsync); - return policyBuilder.policyBuilder.policyBuilder.WaitAndRetryAsync(policyBuilder.retryCount, waitProvider, onRetryProvider); + return policyBuilder.policyBuilder.policyBuilder.WaitAndRetryAsync(policyBuilder.maxRetryCount, waitProvider, onRetryProvider); } ///// @@ -840,8 +904,8 @@ IAsyncPolicy IPolicyBuilderBuild.BuildAsyncPolicy() // => new(((IPolicyBuilderBuild)this).BuildPolicy()); /// - AsyncRetryHandlerPolicy IPolicyBuilderBuild.BuildAsync() - => new(((IPolicyBuilderBuild)this).BuildAsyncPolicy()); + AsyncRetryHandlerPolicy IPolicyBuilderBuild.AsyncBuild() + => new(((IPolicyBuilderBuild)this).AsyncBuildPolicy()); } } #endregion diff --git a/src/Tes.ApiClients/Tes.ApiClients.csproj b/src/Tes.ApiClients/Tes.ApiClients.csproj index 84816dba0..1d3fd2d52 100644 --- a/src/Tes.ApiClients/Tes.ApiClients.csproj +++ b/src/Tes.ApiClients/Tes.ApiClients.csproj @@ -9,7 +9,6 @@ - @@ -25,9 +24,6 @@ <_Parameter1>Tes.ApiClients.Tests - - <_Parameter1>DynamicProxyGenAssembly2 - diff --git a/src/Tes.Runner/Docker/DockerExecutor.cs 
b/src/Tes.Runner/Docker/DockerExecutor.cs index 00f14e654..1a83e8f6d 100644 --- a/src/Tes.Runner/Docker/DockerExecutor.cs +++ b/src/Tes.Runner/Docker/DockerExecutor.cs @@ -17,9 +17,7 @@ public class DockerExecutor private readonly IDockerClient dockerClient = null!; private readonly ILogger logger = PipelineLoggerFactory.Create(); private readonly NetworkUtility networkUtility = new(); - private readonly RetryHandler.AsyncRetryHandlerPolicy asyncRetryPolicy = - new RetryHandler(Options.Create(new RetryPolicyOptions())) - .RetryDefaultPolicyBuilder().SetOnRetryBehavior().BuildAsync(); + private readonly RetryHandler.AsyncRetryHandlerPolicy asyncRetryPolicy = null!; private readonly IStreamLogReader streamLogReader = null!; const int LogStreamingMaxWaitTimeInSeconds = 30; @@ -30,6 +28,7 @@ public DockerExecutor(Uri dockerHost) : this(new DockerClientConfiguration(docke } public DockerExecutor(IDockerClient dockerClient, IStreamLogReader streamLogReader) + : this() // Add logging to retries { ArgumentNullException.ThrowIfNull(dockerClient); ArgumentNullException.ThrowIfNull(streamLogReader); @@ -43,7 +42,9 @@ public DockerExecutor(IDockerClient dockerClient, IStreamLogReader streamLogRead /// protected DockerExecutor() { - + this.asyncRetryPolicy = + new RetryHandler(Options.Create(new RetryPolicyOptions())) + .DefaultRetryPolicyBuilder().SetOnRetryBehavior(logger).AsyncBuild(); } public virtual async Task RunOnContainerAsync(string? imageName, string? tag, List? commandsToExecute, List? volumeBindings, string? workingDir) @@ -129,13 +130,13 @@ private async Task PullImageWithRetriesAsync(string imageName, string? 
tag, Auth { logger.LogInformation($"Pulling image name: {imageName} image tag: {tag}"); - await asyncRetryPolicy.ExecuteWithRetryAsync(async _ => + await asyncRetryPolicy.ExecuteWithRetryAsync(async () => { await dockerClient.Images.CreateImageAsync( new ImagesCreateParameters() { FromImage = imageName, Tag = tag }, authConfig, new Progress(message => logger.LogDebug(message.Status))); - }, CancellationToken.None); + }); } private async Task DeleteAllImagesAsync() diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index aae9401f7..fbae23fcd 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -90,11 +90,11 @@ public AzureProxy(IOptions batchAccountOptions, IBatchPoolM batchRetryPolicyWhenJobNotFound = builder .SetOnRetryBehavior(onRetry: OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenJobNotFound(LogRetryErrorOnRetryHandler())) - .BuildAsync(); + .AsyncBuild(); batchRetryPolicyWhenNodeNotReady = builder .SetOnRetryBehavior(onRetry: OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenNodeNotReady(LogRetryErrorOnRetryHandler())) - .BuildAsync(); + .AsyncBuild(); if (!string.IsNullOrWhiteSpace(batchAccountOptions.Value.AppKey)) { diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index 184cb311c..debbdddfd 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -44,7 +44,7 @@ public CachingWithRetriesAzureProxy(IAzureProxy azureProxy, CachingRetryHandler this.azureProxy = azureProxy; this.logger = logger; - var builder = cachingRetryHandler.RetryDefaultPolicyBuilder(); + var builder = cachingRetryHandler.DefaultRetryPolicyBuilder(); this.cachingRetry = builder.SetOnRetryBehavior(this.logger).AddCaching().Build(); this.cachingAsyncRetry = builder.SetOnRetryBehavior(this.logger).AddCaching().BuildAsync(); this.cachingAsyncRetryExceptWhenExists = builder diff --git 
a/src/TesApi.Web/Management/AzureProvider.cs b/src/TesApi.Web/Management/AzureProvider.cs index 7e9632ec5..6cfb421d9 100644 --- a/src/TesApi.Web/Management/AzureProvider.cs +++ b/src/TesApi.Web/Management/AzureProvider.cs @@ -31,7 +31,7 @@ protected AzureProvider(CachingRetryHandler cachingRetryHandler, AzureManagement this.ManagementClientsFactory = managementClientsFactory; this.Logger = logger; this.CachingAsyncRetryPolicy = cachingRetryHandler - .RetryDefaultPolicyBuilder() + .DefaultRetryPolicyBuilder() .SetOnRetryBehavior(this.Logger) .AddCaching() .BuildAsync(); From 3f577ebc2990c1b55d935191f2ad786578b1b863 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 1 Dec 2023 13:35:14 -0800 Subject: [PATCH 104/202] format and refactor for clarity --- .../CacheAndRetryHandlerTest.cs | 109 +++++------ .../TerraWsmApiClientTests.cs | 6 +- .../TestServices/RetryHandlersHelpers.cs | 4 +- src/Tes.ApiClients/CachingRetryHandler.cs | 18 +- src/Tes.ApiClients/HttpApiClient.cs | 2 +- src/TesApi.Web/AzureProxy.cs | 52 ++---- .../CachingWithRetriesAzureProxy.cs | 174 +++++------------- src/TesApi.Web/Management/AzureProvider.cs | 2 +- 8 files changed, 125 insertions(+), 242 deletions(-) diff --git a/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs b/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs index 7a0a2697f..1810dfe21 100644 --- a/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs +++ b/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs @@ -14,7 +14,7 @@ public class CacheAndRetryHandlerTest { private IMemoryCache appCache = null!; private CachingRetryHandler.CachingAsyncRetryHandlerPolicy cachingAsyncPolicy = null!; - private RetryHandler.AsyncRetryHandlerPolicy cachingAsyncHttpResponseMessagePolicy = null!; + private CachingRetryHandler.CachingAsyncRetryHandlerPolicy cachingAsyncHttpResponseMessagePolicy = null!; private Mock mockInstanceToRetry = null!; private const int MaxRetryCount = 3; @@ -30,14 +30,14 @@ public void SetUp() 
cachingAsyncHttpResponseMessagePolicy = cachingRetryHandler .DefaultRetryHttpResponseMessagePolicyBuilder() .SetOnRetryBehavior() - //.AddCaching() + .AddCaching() .AsyncBuild(); cachingAsyncPolicy = cachingRetryHandler .DefaultRetryPolicyBuilder() .SetOnRetryBehavior() .AddCaching() - .BuildAsync(); + .AsyncBuild(); } [TestCleanup] @@ -105,62 +105,63 @@ public async Task ExecuteHttpRequestWithRetryAsync_RetriesOnlyOnExpectedFailureC mockFactory.Setup(f => f.CreateResponseAsync()).Returns(CreateResponseAsync(statusCode)); var response = - await cachingAsyncHttpResponseMessagePolicy.ExecuteWithRetryAsync( - mockFactory.Object.CreateResponseAsync); + await cachingAsyncHttpResponseMessagePolicy.ExecuteWithRetryAsync(_ => + mockFactory.Object.CreateResponseAsync(), + CancellationToken.None); mockFactory.Verify(f => f.CreateResponseAsync(), Times.Exactly(numberOfTimes)); Assert.AreEqual(response.StatusCode, statusCode); } - // [TestMethod] - // [DataRow(HttpStatusCode.OK)] - // [DataRow(HttpStatusCode.Created)] - // [DataRow(HttpStatusCode.Accepted)] - // [DataRow(HttpStatusCode.PartialContent)] - // [DataRow(HttpStatusCode.NoContent)] - // public async Task ExecuteHttpRequestWithRetryAndCachingAsync_CallOnceCachesOnSuccess(HttpStatusCode statusCode) - // { - // var cacheKey = Guid.NewGuid().ToString(); - // var mockFactory = new Mock(); - // mockFactory.Setup(f => f.CreateResponseAsync()).Returns(CreateResponseAsync(statusCode)); - // - // var first = - // await cacheAndRetryHandler.ExecuteHttpRequestWithRetryAndCachingAsync(cacheKey, _ => - // mockFactory.Object.CreateResponseAsync(), - // System.Threading.CancellationToken.None); - // - // var second = - // await cacheAndRetryHandler.ExecuteHttpRequestWithRetryAndCachingAsync(cacheKey, _ => - // mockFactory.Object.CreateResponseAsync(), - // System.Threading.CancellationToken.None); - // - // mockFactory.Verify(f => f.CreateResponseAsync(), Times.Once); - // Assert.AreEqual(first.StatusCode, statusCode); - // 
Assert.AreEqual(second.StatusCode, statusCode); - // Assert.IsTrue(appCache.TryGetValue(cacheKey, out HttpResponseMessage? cachedResponse)); - // Assert.AreEqual(first.StatusCode, cachedResponse!.StatusCode); - // } - - // [TestMethod] - // [DataRow(HttpStatusCode.Forbidden, 1)] //bad codes but not retriable - // [DataRow(HttpStatusCode.BadRequest, 1)] - // [DataRow(HttpStatusCode.NotFound, 1)] - // [DataRow(HttpStatusCode.Conflict, 1)] - // [DataRow(HttpStatusCode.BadGateway, MaxRetryCount + 1)] //retriable codes - // [DataRow(HttpStatusCode.TooManyRequests, MaxRetryCount + 1)] - // [DataRow(HttpStatusCode.ServiceUnavailable, MaxRetryCount + 1)] - // [DataRow(HttpStatusCode.InternalServerError, MaxRetryCount + 1)] - // public async Task ExecuteHttpRequestWithRetryAndCachingAsync_RetriesThrowsAndNotCachedOnFailure(HttpStatusCode statusCode, int numberOfTimes) - // { - // var cacheKey = Guid.NewGuid().ToString(); - // var mockFactory = new Mock(); - // mockFactory.Setup(f => f.CreateResponseAsync()).Returns(CreateResponseAsync(statusCode)); - // - // await Assert.ThrowsExceptionAsync(() => cacheAndRetryHandler.ExecuteHttpRequestWithRetryAndCachingAsync(cacheKey, _ => mockFactory.Object.CreateResponseAsync(), System.Threading.CancellationToken.None)); - // - // mockFactory.Verify(f => f.CreateResponseAsync(), Times.Exactly(numberOfTimes)); - // Assert.IsFalse(appCache.TryGetValue(cacheKey, out HttpResponseMessage _)); - // } + [TestMethod] + [DataRow(HttpStatusCode.OK)] + [DataRow(HttpStatusCode.Created)] + [DataRow(HttpStatusCode.Accepted)] + [DataRow(HttpStatusCode.PartialContent)] + [DataRow(HttpStatusCode.NoContent)] + public async Task ExecuteHttpRequestWithRetryAndCachingAsync_CallOnceCachesOnSuccess(HttpStatusCode statusCode) + { + var cacheKey = Guid.NewGuid().ToString(); + var mockFactory = new Mock(); + mockFactory.Setup(f => f.CreateResponseAsync()).Returns(CreateResponseAsync(statusCode)); + + var first = + await 
cachingAsyncHttpResponseMessagePolicy.ExecuteWithRetryAndCachingAsync(cacheKey, _ => + mockFactory.Object.CreateResponseAsync(), + CancellationToken.None); + + var second = + await cachingAsyncHttpResponseMessagePolicy.ExecuteWithRetryAndCachingAsync(cacheKey, _ => + mockFactory.Object.CreateResponseAsync(), + CancellationToken.None); + + mockFactory.Verify(f => f.CreateResponseAsync(), Times.Once); + Assert.AreEqual(first.StatusCode, statusCode); + Assert.AreEqual(second.StatusCode, statusCode); + Assert.IsTrue(appCache.TryGetValue(cacheKey, out HttpResponseMessage? cachedResponse)); + Assert.AreEqual(first.StatusCode, cachedResponse!.StatusCode); + } + + [TestMethod] + [DataRow(HttpStatusCode.Forbidden, 1)] //bad codes but not retriable + [DataRow(HttpStatusCode.BadRequest, 1)] + [DataRow(HttpStatusCode.NotFound, 1)] + [DataRow(HttpStatusCode.Conflict, 1)] + [DataRow(HttpStatusCode.BadGateway, MaxRetryCount + 1)] //retriable codes + [DataRow(HttpStatusCode.TooManyRequests, MaxRetryCount + 1)] + [DataRow(HttpStatusCode.ServiceUnavailable, MaxRetryCount + 1)] + [DataRow(HttpStatusCode.InternalServerError, MaxRetryCount + 1)] + public async Task ExecuteHttpRequestWithRetryAndCachingAsync_RetriesThrowsAndNotCachedOnFailure(HttpStatusCode statusCode, int numberOfTimes) + { + var cacheKey = Guid.NewGuid().ToString(); + var mockFactory = new Mock(); + mockFactory.Setup(f => f.CreateResponseAsync()).Returns(CreateResponseAsync(statusCode)); + + await Assert.ThrowsExceptionAsync(() => cachingAsyncHttpResponseMessagePolicy.ExecuteWithRetryConversionAndCachingAsync(cacheKey, _ => mockFactory.Object.CreateResponseAsync(), (m, _) => Task.FromResult(m.EnsureSuccessStatusCode()), CancellationToken.None)); + + mockFactory.Verify(f => f.CreateResponseAsync(), Times.Exactly(numberOfTimes)); + Assert.IsFalse(appCache.TryGetValue(cacheKey, out HttpResponseMessage? 
_)); + } private static Task CreateResponseAsync(HttpStatusCode statusCode) => Task.FromResult(new(statusCode)); diff --git a/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs b/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs index 880dbdd50..d71a5f691 100644 --- a/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs +++ b/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs @@ -113,8 +113,7 @@ public async Task GetContainerResourcesAsync_ValidRequest_ReturnsPayload() Content = new StringContent(terraApiStubData.GetContainerResourcesApiResponseInJson()) }; - asyncResponseRetryPolicy.Value - .Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) + asyncResponseRetryPolicy.Value.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(response); var apiResponse = await terraWsmApiClient.GetContainerResourcesAsync(terraApiStubData.WorkspaceId, @@ -131,8 +130,7 @@ public async Task DeleteBatchPoolAsync_204Response_Succeeds() var wsmResourceId = Guid.NewGuid(); var response = new HttpResponseMessage(HttpStatusCode.NoContent); - asyncResponseRetryPolicy.Value - .Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) + asyncResponseRetryPolicy.Value.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(response); await terraWsmApiClient.DeleteBatchPoolAsync(terraApiStubData.WorkspaceId, wsmResourceId, CancellationToken.None); diff --git a/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers.cs b/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers.cs index 7b72f250a..67fc72400 100644 --- a/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers.cs +++ b/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers.cs @@ -20,7 +20,7 @@ internal static Mock GetCachingAsyncRetryPolicyM var cachingAsyncRetryPolicy = new Mock(); _ = cachingAsyncRetryPolicy.As(); var cachingPolicyBuild = new Mock(); - cachingPolicyBuild.Setup(policy => policy.BuildAsync()) + 
cachingPolicyBuild.Setup(policy => policy.AsyncBuild()) .Returns(cachingAsyncRetryPolicy.Object); cachingRetryHandler.As().Setup(policy => policy.CachingPolicyBuilder(It.IsAny())) .Returns(cachingPolicyBuild.Object); @@ -48,7 +48,7 @@ internal static Mock> GetCachingAsyncRet var cachingAsyncRetryPolicy = new Mock>(); _ = cachingAsyncRetryPolicy.As>(); var cachingPolicyBuild = new Mock>(); - cachingPolicyBuild.Setup(policy => policy.BuildAsync()) + cachingPolicyBuild.Setup(policy => policy.AsyncBuild()) .Returns(cachingAsyncRetryPolicy.Object); cachingRetryHandler.As().Setup(policy => policy.CachingPolicyBuilder(It.IsAny>())) .Returns(cachingPolicyBuild.Object); diff --git a/src/Tes.ApiClients/CachingRetryHandler.cs b/src/Tes.ApiClients/CachingRetryHandler.cs index 8090ddcc0..ad5ae69bd 100644 --- a/src/Tes.ApiClients/CachingRetryHandler.cs +++ b/src/Tes.ApiClients/CachingRetryHandler.cs @@ -231,13 +231,13 @@ public interface ICachingPolicyBuilderBuild /// Builds with caching. /// /// Caching retry policy. - CachingRetryHandlerPolicy Build(); + CachingRetryHandlerPolicy SyncBuild(); /// /// Builds with caching. /// /// Caching retry policy. - CachingAsyncRetryHandlerPolicy BuildAsync(); + CachingAsyncRetryHandlerPolicy AsyncBuild(); } public interface ICachingPolicyBuilderBuild @@ -246,13 +246,13 @@ public interface ICachingPolicyBuilderBuild ///// Builds with caching. ///// ///// Caching retry policy. - //CachingRetryHandlerPolicy Build(); + //CachingRetryHandlerPolicy SyncBuild(); /// /// Builds with caching. /// /// Caching retry policy. - CachingAsyncRetryHandlerPolicy BuildAsync(); + CachingAsyncRetryHandlerPolicy AsyncBuild(); } /// Used internally and for testing. 
@@ -284,12 +284,12 @@ public CachingPolicyBuilderBuild(IPolicyBuilderBuild policyBuilder, CachingRetry this.cachingHandler = handler; } - CachingRetryHandlerPolicy ICachingPolicyBuilderBuild.Build() + CachingRetryHandlerPolicy ICachingPolicyBuilderBuild.SyncBuild() { return new(policyBuilder.SyncBuildPolicy(), cachingHandler.AppCache); } - CachingAsyncRetryHandlerPolicy ICachingPolicyBuilderBuild.BuildAsync() + CachingAsyncRetryHandlerPolicy ICachingPolicyBuilderBuild.AsyncBuild() { return new(policyBuilder.AsyncBuildPolicy(), cachingHandler.AppCache); } @@ -307,12 +307,12 @@ public CachingPolicyBuilderBuild(IPolicyBuilderBuild policyBuilder, Cac this.cachingHandler = handler; } - //CachingRetryHandlerPolicy ICachingPolicyBuilderBuild.Build() + //CachingRetryHandlerPolicy ICachingPolicyBuilderBuild.SyncBuild() //{ - // return new(policyBuilder.BuildPolicy(), cachingHandler.AppCache); + // return new(policyBuilder.SyncBuildPolicy(), cachingHandler.AppCache); //} - CachingAsyncRetryHandlerPolicy ICachingPolicyBuilderBuild.BuildAsync() + CachingAsyncRetryHandlerPolicy ICachingPolicyBuilderBuild.AsyncBuild() { return new(policyBuilder.AsyncBuildPolicy(), cachingHandler.AppCache); } diff --git a/src/Tes.ApiClients/HttpApiClient.cs b/src/Tes.ApiClients/HttpApiClient.cs index 680f50f7d..fd3aa2792 100644 --- a/src/Tes.ApiClients/HttpApiClient.cs +++ b/src/Tes.ApiClients/HttpApiClient.cs @@ -49,7 +49,7 @@ protected HttpApiClient(CachingRetryHandler cachingRetryHandler, ILogger logger) .DefaultRetryHttpResponseMessagePolicyBuilder() .SetOnRetryBehavior(onRetry: LogRetryErrorOnRetryHttpResponseMessageHandler()) .AddCaching() - .BuildAsync(); + .AsyncBuild(); } /// diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index fbae23fcd..9e854ca1b 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -84,16 +84,16 @@ public AzureProxy(IOptions batchAccountOptions, IBatchPoolM throw new InvalidOperationException("The batch account name 
is missing from the the configuration."); } - var builder = retryHandler.PolicyBuilder - .OpinionatedRetryPolicy(Policy.Handle()) - .WithExceptionBasedWaitWithRetryPolicyOptionsBackup((attempt, exception) => (exception as BatchException)?.RequestInformation?.RetryAfter, true); - - batchRetryPolicyWhenJobNotFound = builder - .SetOnRetryBehavior(onRetry: OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenJobNotFound(LogRetryErrorOnRetryHandler())) + batchRetryPolicyWhenJobNotFound = retryHandler.PolicyBuilder + .OpinionatedRetryPolicy(Policy.Handle(ex => BatchErrorCodeStrings.JobNotFound.Equals(ex.RequestInformation.BatchError.Code, StringComparison.OrdinalIgnoreCase))) + .WithExceptionBasedWaitWithRetryPolicyOptionsBackup((attempt, exception) => (exception as BatchException)?.RequestInformation?.RetryAfter, backupSkipProvidedIncrements: true) + .SetOnRetryBehavior(onRetry: LogRetryErrorOnRetryHandler()) .AsyncBuild(); - batchRetryPolicyWhenNodeNotReady = builder - .SetOnRetryBehavior(onRetry: OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenNodeNotReady(LogRetryErrorOnRetryHandler())) + batchRetryPolicyWhenNodeNotReady = retryHandler.PolicyBuilder + .OpinionatedRetryPolicy(Policy.Handle(ex => "NodeNotReady".Equals(ex.RequestInformation.BatchError.Code, StringComparison.OrdinalIgnoreCase))) + .WithExceptionBasedWaitWithRetryPolicyOptionsBackup((attempt, exception) => (exception as BatchException)?.RequestInformation?.RetryAfter, backupSkipProvidedIncrements: true) + .SetOnRetryBehavior(onRetry: LogRetryErrorOnRetryHandler()) .AsyncBuild(); if (!string.IsNullOrWhiteSpace(batchAccountOptions.Value.AppKey)) @@ -125,37 +125,7 @@ public AzureProxy(IOptions batchAccountOptions, IBatchPoolM } /// - /// Rethrows exception if exception is and the Batch API Error Code returned otherwise invokes . - /// - /// Polly retry handler. 
- /// - private static RetryHandler.OnRetryHandler OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenJobNotFound(RetryHandler.OnRetryHandler OnRetry) - => new((outcome, timespan, retryCount, correlationId, caller) => - { - if (outcome is BatchException batchException && batchException.RequestInformation?.BatchError?.Code != BatchErrorCodeStrings.JobNotFound) - { - System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(outcome).Throw(); - } - OnRetry?.Invoke(outcome, timespan, retryCount, correlationId, caller); - }); - - /// - /// Rethrows exception if exception is and the Batch API Error Code returned "NodeNotReady" otherwise invokes . - /// - /// Polly retry handler. - /// - private static RetryHandler.OnRetryHandler OnRetryMicrosoftAzureBatchCommonBatchExceptionWhenNodeNotReady(RetryHandler.OnRetryHandler OnRetry) - => new((outcome, timespan, retryCount, correlationId, caller) => - { - if (outcome is BatchException batchException && !"NodeNotReady".Equals(batchException.RequestInformation?.BatchError?.Code, StringComparison.InvariantCultureIgnoreCase)) - { - System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(outcome).Throw(); - } - OnRetry?.Invoke(outcome, timespan, retryCount, correlationId, caller); - }); - - /// - /// A logging Polly retry handler. + /// A logging retry handler. /// /// private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHandler() @@ -163,8 +133,8 @@ private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHandler() { var requestId = (exception as BatchException)?.RequestInformation?.ServiceRequestId ?? "n/a"; var reason = (exception.InnerException as Microsoft.Azure.Batch.Protocol.Models.BatchErrorException)?.Response?.ReasonPhrase ?? 
"n/a"; - logger?.LogError(exception, @"Retrying in {Method}: RetryCount: {RetryCount} RetryCount: {TimeSpan} BatchErrorCode: '{BatchErrorCode}', ApiStatusCode '{ApiStatusCode}', Reason: '{ReasonPhrase}' ServiceRequestId: '{ServiceRequestId}', CorrelationId: {CorrelationId}", - caller, retryCount, timeSpan.ToString("c"), (exception as BatchException)?.RequestInformation?.BatchError?.Code ?? "n/a", (exception as BatchException)?.RequestInformation?.HttpStatusCode?.ToString("G") ?? "n/a", reason, requestId, correlationId.ToString("D")); + logger?.LogError(exception, @"Retrying in {Method}: RetryCount: {RetryCount} RetryCount: {TimeSpan:c} BatchErrorCode: '{BatchErrorCode}', ApiStatusCode '{ApiStatusCode}', Reason: '{ReasonPhrase}' ServiceRequestId: '{ServiceRequestId}', CorrelationId: {CorrelationId:D}", + caller, retryCount, timeSpan, (exception as BatchException)?.RequestInformation?.BatchError?.Code ?? "n/a", (exception as BatchException)?.RequestInformation?.HttpStatusCode?.ToString("G") ?? "n/a", reason, requestId, correlationId); }); // TODO: Static method because the instrumentation key is needed in both Program.cs and Startup.cs and we wanted to avoid intializing the batch client twice. 
diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index debbdddfd..fe07f6253 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -4,7 +4,6 @@ using System; using System.Collections.Generic; using System.Linq; -using System.Runtime.ExceptionServices; using System.Threading; using System.Threading.Tasks; using Microsoft.Azure.Batch; @@ -44,15 +43,21 @@ public CachingWithRetriesAzureProxy(IAzureProxy azureProxy, CachingRetryHandler this.azureProxy = azureProxy; this.logger = logger; - var builder = cachingRetryHandler.DefaultRetryPolicyBuilder(); - this.cachingRetry = builder.SetOnRetryBehavior(this.logger).AddCaching().Build(); - this.cachingAsyncRetry = builder.SetOnRetryBehavior(this.logger).AddCaching().BuildAsync(); - this.cachingAsyncRetryExceptWhenExists = builder - .SetOnRetryBehavior(this.logger, OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenExists()) - .AddCaching().BuildAsync(); - this.cachingAsyncRetryExceptWhenNotFound = builder - .SetOnRetryBehavior(this.logger, OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound()) - .AddCaching().BuildAsync(); + var sleepDuration = new Func((attempt, exception) => (exception as BatchException)?.RequestInformation?.RetryAfter); + + this.cachingRetry = cachingRetryHandler.PolicyBuilder.OpinionatedRetryPolicy() + .WithExceptionBasedWaitWithRetryPolicyOptionsBackup(sleepDuration, backupSkipProvidedIncrements: true).SetOnRetryBehavior(this.logger).AddCaching().SyncBuild(); + + this.cachingAsyncRetry = cachingRetryHandler.PolicyBuilder.OpinionatedRetryPolicy() + .WithExceptionBasedWaitWithRetryPolicyOptionsBackup(sleepDuration, backupSkipProvidedIncrements: true).SetOnRetryBehavior(this.logger).AddCaching().AsyncBuild(); + + this.cachingAsyncRetryExceptWhenExists = cachingRetryHandler.PolicyBuilder + .OpinionatedRetryPolicy(Polly.Policy.Handle(ex => 
!CreationErrorFoundCodes.Contains(ex.RequestInformation?.BatchError?.Code, StringComparer.OrdinalIgnoreCase))) + .WithExceptionBasedWaitWithRetryPolicyOptionsBackup(sleepDuration, backupSkipProvidedIncrements: true).SetOnRetryBehavior(this.logger).AddCaching().AsyncBuild(); + + this.cachingAsyncRetryExceptWhenNotFound = cachingRetryHandler.PolicyBuilder + .OpinionatedRetryPolicy(Polly.Policy.Handle(ex => !DeletionErrorFoundCodes.Contains(ex.RequestInformation?.BatchError?.Code, StringComparer.OrdinalIgnoreCase))) + .WithExceptionBasedWaitWithRetryPolicyOptionsBackup(sleepDuration, backupSkipProvidedIncrements: true).SetOnRetryBehavior(this.logger).AddCaching().AsyncBuild(); } private static readonly string[] CreationErrorFoundCodes = new[] @@ -69,32 +74,6 @@ public CachingWithRetriesAzureProxy(IAzureProxy azureProxy, CachingRetryHandler BatchErrorCodeStrings.JobNotFound }; - /// - /// Rethrows exception if exception is and the Batch API call returned otherwise invokes 's logger. - /// - /// - private static RetryHandler.OnRetryHandler OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenExists() - => new((exception, _, _, _, _) => - { - if (exception is BatchException batchException && CreationErrorFoundCodes.Contains(batchException.RequestInformation?.BatchError?.Code, StringComparer.OrdinalIgnoreCase)) - { - ExceptionDispatchInfo.Capture(exception).Throw(); - } - }); - - /// - /// Rethrows exception if exception is and the Batch API call returned otherwise invokes 's logger. 
- /// - /// - private static RetryHandler.OnRetryHandler OnRetryMicrosoftAzureBatchCommonBatchExceptionExceptWhenNotFound() - => new((exception, _, _, _, _) => - { - if (exception is BatchException batchException && DeletionErrorFoundCodes.Contains(batchException.RequestInformation?.BatchError?.Code, StringComparer.OrdinalIgnoreCase)) - { - ExceptionDispatchInfo.Capture(exception).Throw(); - } - }); - /// public Task CreateAutoPoolModeBatchJobAsync(string jobId, CloudTask cloudTask, PoolInformation poolInformation, CancellationToken cancellationToken) => azureProxy.CreateAutoPoolModeBatchJobAsync(jobId, cloudTask, poolInformation, cancellationToken); @@ -166,16 +145,10 @@ public async Task DeleteBatchPoolAsync(string poolId, CancellationToken cancella } /// - public Task GetBatchPoolAsync(string poolId, CancellationToken cancellationToken, DetailLevel detailLevel) - { - return cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.GetBatchPoolAsync(poolId, ct, detailLevel), cancellationToken); - } + public Task GetBatchPoolAsync(string poolId, CancellationToken cancellationToken, DetailLevel detailLevel) => cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.GetBatchPoolAsync(poolId, ct, detailLevel), cancellationToken); /// - public Task GetBatchJobAsync(string jobId, CancellationToken cancellationToken, DetailLevel detailLevel) - { - return cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.GetBatchJobAsync(jobId, ct, detailLevel), cancellationToken); - } + public Task GetBatchJobAsync(string jobId, CancellationToken cancellationToken, DetailLevel detailLevel) => cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.GetBatchJobAsync(jobId, ct, detailLevel), cancellationToken); /// public async Task DeleteBatchComputeNodesAsync(string poolId, IEnumerable computeNodes, CancellationToken cancellationToken) @@ -185,85 +158,54 @@ public async Task DeleteBatchComputeNodesAsync(string poolId, IEnumerable - public Task DownloadBlobAsync(Uri 
blobAbsoluteUri, CancellationToken cancellationToken) - { - return cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.DownloadBlobAsync(blobAbsoluteUri, ct), cancellationToken); - } + public Task DownloadBlobAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) => cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.DownloadBlobAsync(blobAbsoluteUri, ct), cancellationToken); /// - public Task BlobExistsAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) - { - return cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.BlobExistsAsync(blobAbsoluteUri, ct), cancellationToken); - } + public Task BlobExistsAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) => cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.BlobExistsAsync(blobAbsoluteUri, ct), cancellationToken); /// - public Task> GetActivePoolIdsAsync(string prefix, TimeSpan minAge, CancellationToken cancellationToken) - { - return cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.GetActivePoolIdsAsync(prefix, minAge, ct), cancellationToken); - } + public Task> GetActivePoolIdsAsync(string prefix, TimeSpan minAge, CancellationToken cancellationToken) => cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.GetActivePoolIdsAsync(prefix, minAge, ct), cancellationToken); /// - public IAsyncEnumerable GetActivePoolsAsync(string hostName) - { - return cachingRetry.ExecuteWithRetry(() => azureProxy.GetActivePoolsAsync(hostName)); - } + public IAsyncEnumerable GetActivePoolsAsync(string hostName) => cachingRetry.ExecuteWithRetry(() => azureProxy.GetActivePoolsAsync(hostName)); /// - public int GetBatchActiveJobCount() - { - return cachingRetry.ExecuteWithRetry(azureProxy.GetBatchActiveJobCount); - } + public int GetBatchActiveJobCount() => cachingRetry.ExecuteWithRetry(azureProxy.GetBatchActiveJobCount); /// - public IEnumerable GetBatchActiveNodeCountByVmSize() - { - return cachingRetry.ExecuteWithRetry(azureProxy.GetBatchActiveNodeCountByVmSize); - } + 
public IEnumerable GetBatchActiveNodeCountByVmSize() => cachingRetry.ExecuteWithRetry(azureProxy.GetBatchActiveNodeCountByVmSize); /// - public int GetBatchActivePoolCount() - { - return cachingRetry.ExecuteWithRetry(azureProxy.GetBatchActivePoolCount); - } + public int GetBatchActivePoolCount() => cachingRetry.ExecuteWithRetry(azureProxy.GetBatchActivePoolCount); /// - public Task GetBatchJobAndTaskStateAsync(Tes.Models.TesTask tesTask, bool usingAutoPools, CancellationToken cancellationToken) - { - return cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.GetBatchJobAndTaskStateAsync(tesTask, usingAutoPools, ct), cancellationToken); - } + public Task GetBatchJobAndTaskStateAsync(Tes.Models.TesTask tesTask, bool usingAutoPools, CancellationToken cancellationToken) => cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.GetBatchJobAndTaskStateAsync(tesTask, usingAutoPools, ct), cancellationToken); /// - public Task GetNextBatchJobIdAsync(string tesTaskId, CancellationToken cancellationToken) - { - return cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.GetNextBatchJobIdAsync(tesTaskId, ct), cancellationToken); - } + public Task GetNextBatchJobIdAsync(string tesTaskId, CancellationToken cancellationToken) => cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.GetNextBatchJobIdAsync(tesTaskId, ct), cancellationToken); /// - public Task> GetPoolIdsReferencedByJobsAsync(CancellationToken cancellationToken) - { - return cachingAsyncRetry.ExecuteWithRetryAsync(azureProxy.GetPoolIdsReferencedByJobsAsync, cancellationToken); - } + public Task> GetPoolIdsReferencedByJobsAsync(CancellationToken cancellationToken) => cachingAsyncRetry.ExecuteWithRetryAsync(azureProxy.GetPoolIdsReferencedByJobsAsync, cancellationToken); /// public Task GetStorageAccountKeyAsync(StorageAccountInfo storageAccountInfo, CancellationToken cancellationToken) - { - return 
cachingAsyncRetry.ExecuteWithRetryAndCachingAsync($"{nameof(CachingWithRetriesAzureProxy)}:{storageAccountInfo.Id}", + => cachingAsyncRetry.ExecuteWithRetryAndCachingAsync($"{nameof(CachingWithRetriesAzureProxy)}:{storageAccountInfo.Id}", ct => azureProxy.GetStorageAccountKeyAsync(storageAccountInfo, ct), DateTimeOffset.Now.AddHours(1), cancellationToken); - } /// public async Task GetStorageAccountInfoAsync(string storageAccountName, CancellationToken cancellationToken) { - var storageAccountInfo = cachingAsyncRetry.AppCache.Get($"{nameof(CachingWithRetriesAzureProxy)}:{storageAccountName}"); + var cacheKey = $"{nameof(CachingWithRetriesAzureProxy)}:{storageAccountName}"; + var storageAccountInfo = cachingAsyncRetry.AppCache.Get(cacheKey); - if (storageAccountInfo is null) + if (storageAccountInfo is null ) { storageAccountInfo = await cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.GetStorageAccountInfoAsync(storageAccountName, ct), cancellationToken); - if (storageAccountInfo is not null) + if (storageAccountInfo is not null ) { - cachingAsyncRetry.AppCache.Set($"{nameof(CachingWithRetriesAzureProxy)}:{storageAccountName}", storageAccountInfo, DateTimeOffset.MaxValue); + cachingAsyncRetry.AppCache.Set(cacheKey, storageAccountInfo, DateTimeOffset.MaxValue); } } @@ -271,40 +213,22 @@ public async Task GetStorageAccountInfoAsync(string storageA } /// - public Task> ListBlobsAsync(Uri directoryUri, CancellationToken cancellationToken) - { - return cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.ListBlobsAsync(directoryUri, ct), cancellationToken); - } + public Task> ListBlobsAsync(Uri directoryUri, CancellationToken cancellationToken) => cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.ListBlobsAsync(directoryUri, ct), cancellationToken); /// - public Task> ListOldJobsToDeleteAsync(TimeSpan oldestJobAge, CancellationToken cancellationToken) - { - return cachingAsyncRetry.ExecuteWithRetryAsync(ct => 
azureProxy.ListOldJobsToDeleteAsync(oldestJobAge, ct), cancellationToken); - } + public Task> ListOldJobsToDeleteAsync(TimeSpan oldestJobAge, CancellationToken cancellationToken) => cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.ListOldJobsToDeleteAsync(oldestJobAge, ct), cancellationToken); /// - public Task> ListOrphanedJobsToDeleteAsync(TimeSpan minJobAge, CancellationToken cancellationToken) - { - return cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.ListOrphanedJobsToDeleteAsync(minJobAge, ct), cancellationToken); - } + public Task> ListOrphanedJobsToDeleteAsync(TimeSpan minJobAge, CancellationToken cancellationToken) => cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.ListOrphanedJobsToDeleteAsync(minJobAge, ct), cancellationToken); /// - public Task UploadBlobAsync(Uri blobAbsoluteUri, string content, CancellationToken cancellationToken) - { - return cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.UploadBlobAsync(blobAbsoluteUri, content, ct), cancellationToken); - } + public Task UploadBlobAsync(Uri blobAbsoluteUri, string content, CancellationToken cancellationToken) => cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.UploadBlobAsync(blobAbsoluteUri, content, ct), cancellationToken); /// - public Task UploadBlobFromFileAsync(Uri blobAbsoluteUri, string filePath, CancellationToken cancellationToken) - { - return cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.UploadBlobFromFileAsync(blobAbsoluteUri, filePath, ct), cancellationToken); - } + public Task UploadBlobFromFileAsync(Uri blobAbsoluteUri, string filePath, CancellationToken cancellationToken) => cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.UploadBlobFromFileAsync(blobAbsoluteUri, filePath, ct), cancellationToken); /// - public Task GetBlobPropertiesAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) - { - return cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.GetBlobPropertiesAsync(blobAbsoluteUri, ct), 
cancellationToken); - } + public Task GetBlobPropertiesAsync(Uri blobAbsoluteUri, CancellationToken cancellationToken) => cachingAsyncRetry.ExecuteWithRetryAsync(ct => azureProxy.GetBlobPropertiesAsync(blobAbsoluteUri, ct), cancellationToken); /// public bool LocalFileExists(string path) => azureProxy.LocalFileExists(path); @@ -334,31 +258,21 @@ public Task DeleteBatchPoolIfExistsAsync(string poolId, CancellationToken cancel /// public Task GetFullAllocationStateAsync(string poolId, CancellationToken cancellationToken) - { - return cachingAsyncRetry.ExecuteWithRetryAndCachingAsync( + => cachingAsyncRetry.ExecuteWithRetryAndCachingAsync( $"{nameof(CachingWithRetriesAzureProxy)}:{poolId}", ct => azureProxy.GetFullAllocationStateAsync(poolId, ct), - DateTimeOffset.Now.Add(BatchPoolService.RunInterval).Subtract(TimeSpan.FromSeconds(1)), - cancellationToken); - } + DateTimeOffset.Now.Add(BatchPoolService.RunInterval).Subtract(TimeSpan.FromSeconds(1)), cancellationToken); /// - public IAsyncEnumerable ListComputeNodesAsync(string poolId, DetailLevel detailLevel) - { - return cachingAsyncRetry.ExecuteWithRetryAsync(() => azureProxy.ListComputeNodesAsync(poolId, detailLevel), cachingRetry); - } + public IAsyncEnumerable ListComputeNodesAsync(string poolId, DetailLevel detailLevel) => cachingAsyncRetry.ExecuteWithRetryAsync(() => azureProxy.ListComputeNodesAsync(poolId, detailLevel), cachingRetry); /// - public IAsyncEnumerable ListTasksAsync(string jobId, DetailLevel detailLevel) - { - return cachingAsyncRetry.ExecuteWithRetryAsync(() => azureProxy.ListTasksAsync(jobId, detailLevel), cachingRetry); - } + public IAsyncEnumerable ListTasksAsync(string jobId, DetailLevel detailLevel) => cachingAsyncRetry.ExecuteWithRetryAsync(() => azureProxy.ListTasksAsync(jobId, detailLevel), cachingRetry); /// public Task DisableBatchPoolAutoScaleAsync(string poolId, CancellationToken cancellationToken) => azureProxy.DisableBatchPoolAutoScaleAsync(poolId, cancellationToken); /// - public 
Task EnableBatchPoolAutoScaleAsync(string poolId, bool preemptable, TimeSpan interval, IAzureProxy.BatchPoolAutoScaleFormulaFactory formulaFactory, CancellationToken cancellationToken) - => azureProxy.EnableBatchPoolAutoScaleAsync(poolId, preemptable, interval, formulaFactory, cancellationToken); + public Task EnableBatchPoolAutoScaleAsync(string poolId, bool preemptable, TimeSpan interval, IAzureProxy.BatchPoolAutoScaleFormulaFactory formulaFactory, CancellationToken cancellationToken) => azureProxy.EnableBatchPoolAutoScaleAsync(poolId, preemptable, interval, formulaFactory, cancellationToken); } } diff --git a/src/TesApi.Web/Management/AzureProvider.cs b/src/TesApi.Web/Management/AzureProvider.cs index 6cfb421d9..163b4237a 100644 --- a/src/TesApi.Web/Management/AzureProvider.cs +++ b/src/TesApi.Web/Management/AzureProvider.cs @@ -34,7 +34,7 @@ protected AzureProvider(CachingRetryHandler cachingRetryHandler, AzureManagement .DefaultRetryPolicyBuilder() .SetOnRetryBehavior(this.Logger) .AddCaching() - .BuildAsync(); + .AsyncBuild(); } /// From ddb428ec37a9f0f826b51e613c49c9ec4eb518c7 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 1 Dec 2023 17:14:03 -0800 Subject: [PATCH 105/202] formatting --- src/TesApi.Web/CachingWithRetriesAzureProxy.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index fe07f6253..2e1fd817e 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -199,11 +199,11 @@ public async Task GetStorageAccountInfoAsync(string storageA var cacheKey = $"{nameof(CachingWithRetriesAzureProxy)}:{storageAccountName}"; var storageAccountInfo = cachingAsyncRetry.AppCache.Get(cacheKey); - if (storageAccountInfo is null ) + if (storageAccountInfo is null) { storageAccountInfo = await cachingAsyncRetry.ExecuteWithRetryAsync(ct => 
azureProxy.GetStorageAccountInfoAsync(storageAccountName, ct), cancellationToken); - if (storageAccountInfo is not null ) + if (storageAccountInfo is not null) { cachingAsyncRetry.AppCache.Set(cacheKey, storageAccountInfo, DateTimeOffset.MaxValue); } From c2def55d046d591c2b90a954b16e64cfbeb1e8e6 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 11 Dec 2023 09:09:42 -0800 Subject: [PATCH 106/202] Order process logs (err then out), in order --- src/Tes.Runner/Logs/AppendBlobLogPublisher.cs | 4 ++-- src/TesApi.Web/Events/RunnerEventsProcessor.cs | 11 +++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/Tes.Runner/Logs/AppendBlobLogPublisher.cs b/src/Tes.Runner/Logs/AppendBlobLogPublisher.cs index 2170f9c55..6413ba538 100644 --- a/src/Tes.Runner/Logs/AppendBlobLogPublisher.cs +++ b/src/Tes.Runner/Logs/AppendBlobLogPublisher.cs @@ -13,7 +13,7 @@ namespace Tes.Runner.Logs public class AppendBlobLogPublisher : StreamLogReader { private readonly string targetUrl; - private readonly BlobApiHttpUtils blobApiHttpUtils = new BlobApiHttpUtils(); + private readonly BlobApiHttpUtils blobApiHttpUtils = new(); private readonly string stdOutLogNamePrefix; private readonly string stdErrLogNamePrefix; private readonly ILogger logger = PipelineLoggerFactory.Create(); @@ -36,7 +36,7 @@ public AppendBlobLogPublisher(string targetUrl, string logNamePrefix) stdErrLogNamePrefix = $"{logNamePrefix}_stderr_{prefixTimeStamp}"; } - private string GetBlobNameConsideringBlockCountCurrentState(int blockCount, string logName) + private static string GetBlobNameConsideringBlockCountCurrentState(int blockCount, string logName) { var blobNumber = blockCount / BlobSizeUtils.MaxBlobBlocksCount; diff --git a/src/TesApi.Web/Events/RunnerEventsProcessor.cs b/src/TesApi.Web/Events/RunnerEventsProcessor.cs index f3fb6fb42..6cb532018 100644 --- a/src/TesApi.Web/Events/RunnerEventsProcessor.cs +++ b/src/TesApi.Web/Events/RunnerEventsProcessor.cs @@ -388,9 +388,16 @@ async 
IAsyncEnumerable GetProcessLogs(Tes.Runner.Events.EventMessage mes yield break; } + // See: Tes.Runner.Logs.AppendBlobLogPublisher constructor and GetBlobNameConsideringBlockCountCurrentState() + // There will be two or three underlines in the last path segment of each blob name, and the names will always have a ".txt" extension. + // If there are three underlines, between the last underline and the extension is an incrementing int. If not, the file is the first (and possibly only). + await foreach (var uri in azureProxy.ListBlobsAsync(await storageAccessProvider.GetInternalTesTaskBlobUrlAsync(tesTask, string.Empty, Azure.Storage.Sas.BlobSasPermissions.List, cancellationToken), cancellationToken) - .Where(blob => blob.BlobName.EndsWith(".txt") && blob.BlobName.Split('/').Last().StartsWith(blobNameStartsWith)) - .OrderBy(blob => blob.BlobName) // Not perfect ordering, but reasonable. The final results are more likely to be interpreted by people rather then machines. Perfect would involve regex. + .Select(blob => (blob.BlobUri, BlobName: blob.BlobName.Split('/').Last())) + .Where(blob => blob.BlobName.EndsWith(".txt") && blob.BlobName.StartsWith(blobNameStartsWith)) + .Select(blob => (blob.BlobUri, BlobNameParts: blob.BlobName.Split('_', 4))) + .OrderBy(blob => string.Join('_', blob.BlobNameParts.Take(3))) + .ThenBy(blob => blob.BlobNameParts.Length < 3 ? 
-1 : int.Parse(blob.BlobNameParts[3][..blob.BlobNameParts[3].IndexOf('.')], System.Globalization.CultureInfo.InvariantCulture)) .Select(blob => blob.BlobUri) .WithCancellation(cancellationToken)) { From fb20bff061dad9e5258d885fa9cfee2484577385 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 11 Dec 2023 09:10:57 -0800 Subject: [PATCH 107/202] WIP temporary marker --- src/TesApi.Web/Runner/TaskExecutionScriptingManager.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/TesApi.Web/Runner/TaskExecutionScriptingManager.cs b/src/TesApi.Web/Runner/TaskExecutionScriptingManager.cs index 5cd8d447e..c31f7fadb 100644 --- a/src/TesApi.Web/Runner/TaskExecutionScriptingManager.cs +++ b/src/TesApi.Web/Runner/TaskExecutionScriptingManager.cs @@ -73,6 +73,7 @@ public async Task PrepareBatchScriptAsync(TesTask tesTask } } + /// TODO: Return this to private. // TODO: Delete this line public async Task TryUploadServerTesTask(TesTask tesTask, string blobName, CancellationToken cancellationToken) { try From 99f5e48f3466e1397daaf2f213d2aa66880fb388 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 11 Dec 2023 18:35:29 -0800 Subject: [PATCH 108/202] Unit test multiple new pool quota checks --- src/TesApi.Tests/BatchSchedulerTests.cs | 227 ++++++++++++++---- src/TesApi.Web/BatchScheduler.cs | 37 ++- .../Extensions/EmumerableExtensions.cs | 62 +++++ 3 files changed, 270 insertions(+), 56 deletions(-) create mode 100644 src/TesApi.Web/Extensions/EmumerableExtensions.cs diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index a8edb6ab3..aec6d29f4 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -319,7 +319,7 @@ public async Task TesTaskRemainsQueuedWhenBatchQuotaIsTemporarilyUnavailable() Assert.AreEqual(TesState.QUEUEDEnum, await GetNewTesTaskStateAsync(new TesResources { CpuCores = 1, RamGb = 1, Preemptible = false }, azureProxyReturnValues)); } - private async Task 
AddBatchTaskHandlesExceptions(TesState newState, Func, Action>)> testArranger, Action> resultValidator) + private async Task AddBatchTasksHandlesExceptions(TesState? newState, Func, Action>)> testArranger, Action> resultValidator, int numberOfTasks = 1) { var logger = new Mock>(); var azureProxyReturnValues = AzureProxyReturnValues.Defaults; @@ -329,11 +329,24 @@ private async Task AddBatchTaskHandlesExceptions(TesState newState, Func + { + var task = GetTesTask(); + task.State = TesState.QUEUEDEnum; + + if (numberOfTasks > 1) + { + task.Id = Guid.NewGuid().ToString("D"); + task.Resources.BackendParameters ??= new(); + task.Resources.BackendParameters.Add("vm_size", index % 2 == 1 ? "VmSizeDedicated1" : "VmSizeDedicated2"); + } + + return task; + }).ToArray(); + + _ = await ProcessTesTasksAndGetBatchJobArgumentsAsync( + tasks, GetMockConfig()(), azureProxy, azureProxyReturnValues, @@ -343,30 +356,37 @@ private async Task AddBatchTaskHandlesExceptions(TesState newState, Func logger.Object); }); - GuardAssertsWithTesTask(task, () => + foreach (var task in tasks) { - Assert.AreEqual(newState, task.State); - resultValidator?.Invoke(task, logger.Invocations.Where(i => nameof(ILogger.Log).Equals(i.Method.Name)).Select(i => (((LogLevel?)i.Arguments[0]) ?? LogLevel.None, (Exception)i.Arguments[3]))); - }); + GuardAssertsWithTesTask(task, () => + { + if (newState.HasValue) + { + Assert.AreEqual(newState, task.State); + } + + resultValidator?.Invoke(task, logger.Invocations.Where(i => nameof(ILogger.Log).Equals(i.Method.Name)).Select(i => (((LogLevel?)i.Arguments[0]) ?? 
LogLevel.None, i.Arguments[3] as Exception, i.Arguments[2].ToString()))); + }); + } } [TestMethod] public Task AddBatchTaskHandlesAzureBatchPoolCreationExceptionViaJobCreation() { - return AddBatchTaskHandlesExceptions(TesState.QUEUEDEnum, Arranger, Validator); + return AddBatchTasksHandlesExceptions(TesState.QUEUEDEnum, Arranger, Validator); (Action, Action>) Arranger(AzureProxyReturnValues _1) => (default, azureProxy => azureProxy.Setup(b => b.CreateBatchJobAsync(It.IsAny(), It.IsAny())) .Callback((_1, _2) => throw new Microsoft.Rest.Azure.CloudException("No job for you.") { Body = new() { Code = BatchErrorCodeStrings.OperationTimedOut } })); - void Validator(TesTask tesTask, IEnumerable<(LogLevel logLevel, Exception exception)> logs) + void Validator(TesTask tesTask, IEnumerable<(LogLevel, Exception, string)> logs) { GuardAssertsWithTesTask(tesTask, () => { var log = logs.LastOrDefault(); Assert.IsNotNull(log); - var (logLevel, exception) = log; + var (logLevel, exception, _) = log; Assert.AreEqual(LogLevel.Warning, logLevel); Assert.IsInstanceOfType(exception); }); @@ -376,20 +396,20 @@ void Validator(TesTask tesTask, IEnumerable<(LogLevel logLevel, Exception except [TestMethod] public Task AddBatchTaskHandlesAzureBatchPoolCreationExceptionViaPoolCreation() { - return AddBatchTaskHandlesExceptions(TesState.QUEUEDEnum, Arranger, Validator); + return AddBatchTasksHandlesExceptions(TesState.QUEUEDEnum, Arranger, Validator); (Action, Action>) Arranger(AzureProxyReturnValues _1) => (default, azureProxy => azureProxy.Setup(b => b.CreateBatchPoolAsync(It.IsAny(), It.IsAny(), It.IsAny())) .Callback((_1, _2, _3) => throw new Microsoft.Rest.Azure.CloudException("No job for you.") { Body = new() { Code = BatchErrorCodeStrings.OperationTimedOut } })); - void Validator(TesTask tesTask, IEnumerable<(LogLevel logLevel, Exception exception)> logs) + void Validator(TesTask tesTask, IEnumerable<(LogLevel, Exception, string)> logs) { GuardAssertsWithTesTask(tesTask, () => { var 
log = logs.LastOrDefault(); Assert.IsNotNull(log); - var (logLevel, exception) = log; + var (logLevel, exception, _) = log; Assert.AreEqual(LogLevel.Warning, logLevel); Assert.IsInstanceOfType(exception); }); @@ -399,13 +419,12 @@ void Validator(TesTask tesTask, IEnumerable<(LogLevel logLevel, Exception except [TestMethod] public Task AddBatchTaskHandlesAzureBatchQuotaMaxedOutException() { - var quotaVerifier = new Mock(); - return AddBatchTaskHandlesExceptions(TesState.QUEUEDEnum, Arranger, Validator); + return AddBatchTasksHandlesExceptions(TesState.QUEUEDEnum, Arranger, Validator); (Action, Action>) Arranger(AzureProxyReturnValues _1) => (services => services.AddSingleton(), default); - void Validator(TesTask tesTask, IEnumerable<(LogLevel logLevel, Exception exception)> logs) + void Validator(TesTask tesTask, IEnumerable<(LogLevel logLevel, Exception, string)> logs) { GuardAssertsWithTesTask(tesTask, () => { @@ -416,22 +435,119 @@ void Validator(TesTask tesTask, IEnumerable<(LogLevel logLevel, Exception except } } + [TestMethod] + public async Task MultipleTaskAddBatchTaskHandlesAzureBatchQuotaMaxedOutException() + { + var quotaDelayedTasks = 0; + var queuedTasks = 0; + + await AddBatchTasksHandlesExceptions(null, Arranger, Validator, 4); + + Assert.AreEqual(2, queuedTasks); + Assert.AreEqual(2, quotaDelayedTasks); + + (Action, Action>) Arranger(AzureProxyReturnValues _1) + => (services => services.AddSingleton(), default); + + void Validator(TesTask tesTask, IEnumerable<(LogLevel logLevel, Exception exception, string message)> logs) + { + GuardAssertsWithTesTask(tesTask, () => + { + switch (tesTask.State) + { + case TesState.QUEUEDEnum: + { + var log = logs.LastOrDefault(l => l.message.Contains(tesTask.Id)); + Assert.IsNotNull(log); + Assert.AreEqual(LogLevel.Warning, log.logLevel); + Assert.IsNull(log.exception); + Assert.IsTrue(log.message.Contains(nameof(AzureBatchQuotaMaxedOutException))); + } + ++quotaDelayedTasks; + break; + + case 
TesState.INITIALIZINGEnum: + { + var log = tesTask.Logs?.LastOrDefault(); + Assert.IsNotNull(log); + Assert.IsNotNull(log.VirtualMachineInfo); + Assert.IsNotNull(log.VirtualMachineInfo.VmSize); + } + ++queuedTasks; + break; + + default: + Assert.Fail(); + break; + } + }); + } + } + + [TestMethod] + public async Task MultipleTaskAddBatchTaskMultiplePoolsAdded() + { + var quotaDelayedTasks = 0; + var queuedTasks = 0; + + await AddBatchTasksHandlesExceptions(null, Arranger, Validator, 4); + + Assert.AreEqual(4, queuedTasks); + Assert.AreEqual(0, quotaDelayedTasks); + + (Action, Action>) Arranger(AzureProxyReturnValues _1) + => (services => services.AddSingleton(), default); + + void Validator(TesTask tesTask, IEnumerable<(LogLevel logLevel, Exception exception, string message)> logs) + { + GuardAssertsWithTesTask(tesTask, () => + { + switch (tesTask.State) + { + case TesState.QUEUEDEnum: + { + var log = logs.LastOrDefault(l => l.message.Contains(tesTask.Id)); + Assert.IsNotNull(log); + Assert.AreEqual(LogLevel.Warning, log.logLevel); + Assert.IsNull(log.exception); + Assert.IsTrue(log.message.Contains(nameof(AzureBatchQuotaMaxedOutException))); + } + ++quotaDelayedTasks; + break; + + case TesState.INITIALIZINGEnum: + { + var log = tesTask.Logs?.LastOrDefault(); + Assert.IsNotNull(log); + Assert.IsNotNull(log.VirtualMachineInfo); + Assert.IsNotNull(log.VirtualMachineInfo.VmSize); + } + ++queuedTasks; + break; + + default: + Assert.Fail(); + break; + } + }); + } + } + [TestMethod] public Task AddBatchTaskHandlesAzureBatchLowQuotaException() { - var quotaVerifier = new Mock(); - return AddBatchTaskHandlesExceptions(TesState.SYSTEMERROREnum, Arranger, Validator); + return AddBatchTasksHandlesExceptions(TesState.SYSTEMERROREnum, Arranger, Validator); (Action, Action>) Arranger(AzureProxyReturnValues _1) => (services => services.AddSingleton(), default); - void Validator(TesTask tesTask, IEnumerable<(LogLevel logLevel, Exception exception)> logs) + void Validator(TesTask 
tesTask, IEnumerable<(LogLevel, Exception, string)> logs) { GuardAssertsWithTesTask(tesTask, () => { var log = logs.LastOrDefault(); Assert.IsNotNull(log); - var (logLevel, exception) = log; + var (logLevel, exception, _) = log; Assert.AreEqual(LogLevel.Error, logLevel); Assert.IsInstanceOfType(exception); }); @@ -441,7 +557,7 @@ void Validator(TesTask tesTask, IEnumerable<(LogLevel logLevel, Exception except [TestMethod] public Task AddBatchTaskHandlesAzureBatchVirtualMachineAvailabilityException() { - return AddBatchTaskHandlesExceptions(TesState.SYSTEMERROREnum, Arranger, Validator); + return AddBatchTasksHandlesExceptions(TesState.SYSTEMERROREnum, Arranger, Validator); (Action, Action>) Arranger(AzureProxyReturnValues proxy) { @@ -449,13 +565,13 @@ public Task AddBatchTaskHandlesAzureBatchVirtualMachineAvailabilityException() return (default, default); } - void Validator(TesTask tesTask, IEnumerable<(LogLevel logLevel, Exception exception)> logs) + void Validator(TesTask tesTask, IEnumerable<(LogLevel, Exception, string)> logs) { GuardAssertsWithTesTask(tesTask, () => { var log = logs.LastOrDefault(); Assert.IsNotNull(log); - var (logLevel, exception) = log; + var (logLevel, exception, _) = log; Assert.AreEqual(LogLevel.Error, logLevel); Assert.IsInstanceOfType(exception); }); @@ -465,20 +581,20 @@ void Validator(TesTask tesTask, IEnumerable<(LogLevel logLevel, Exception except [TestMethod] public Task AddBatchTaskHandlesTesException() { - return AddBatchTaskHandlesExceptions(TesState.SYSTEMERROREnum, Arranger, Validator); + return AddBatchTasksHandlesExceptions(TesState.SYSTEMERROREnum, Arranger, Validator); (Action, Action>) Arranger(AzureProxyReturnValues _1) => (default, azureProxy => azureProxy.Setup(b => b.CreateBatchPoolAsync(It.IsAny(), It.IsAny(), It.IsAny())) .Callback((poolInfo, isPreemptible, cancellationToken) => throw new TesException("TestFailureReason"))); - void Validator(TesTask tesTask, IEnumerable<(LogLevel logLevel, Exception exception)> 
logs) + void Validator(TesTask tesTask, IEnumerable<(LogLevel, Exception, string)> logs) { GuardAssertsWithTesTask(tesTask, () => { var log = logs.LastOrDefault(); Assert.IsNotNull(log); - var (logLevel, exception) = log; + var (logLevel, exception, _) = log; Assert.AreEqual(LogLevel.Error, logLevel); Assert.IsInstanceOfType(exception); }); @@ -488,7 +604,7 @@ void Validator(TesTask tesTask, IEnumerable<(LogLevel logLevel, Exception except [TestMethod] public Task AddBatchTaskHandlesBatchClientException() { - return AddBatchTaskHandlesExceptions(TesState.SYSTEMERROREnum, Arranger, Validator); + return AddBatchTasksHandlesExceptions(TesState.SYSTEMERROREnum, Arranger, Validator); (Action, Action>) Arranger(AzureProxyReturnValues _1) => (default, azureProxy => azureProxy.Setup(b => b.AddBatchTaskAsync(It.IsAny(), It.IsAny(), It.IsAny(), It.IsAny())) @@ -498,13 +614,13 @@ public Task AddBatchTaskHandlesBatchClientException() new[] { typeof(string), typeof(Exception) }) .Invoke(new object[] { null, null }) as Exception)); - void Validator(TesTask tesTask, IEnumerable<(LogLevel logLevel, Exception exception)> logs) + void Validator(TesTask tesTask, IEnumerable<(LogLevel, Exception, string)> logs) { GuardAssertsWithTesTask(tesTask, () => { var log = logs.LastOrDefault(); Assert.IsNotNull(log); - var (logLevel, exception) = log; + var (logLevel, exception, _) = log; Assert.AreEqual(LogLevel.Error, logLevel); Assert.IsInstanceOfType(exception); }); @@ -514,7 +630,7 @@ void Validator(TesTask tesTask, IEnumerable<(LogLevel logLevel, Exception except [TestMethod] public Task AddBatchTaskHandlesBatchExceptionForJobQuota() { - return AddBatchTaskHandlesExceptions(TesState.QUEUEDEnum, Arranger, Validator); + return AddBatchTasksHandlesExceptions(TesState.QUEUEDEnum, Arranger, Validator); (Action, Action>) Arranger(AzureProxyReturnValues _1) => (default, azureProxy => azureProxy.Setup(b => b.CreateBatchJobAsync(It.IsAny(), It.IsAny())) @@ -524,7 +640,7 @@ public Task 
AddBatchTaskHandlesBatchExceptionForJobQuota() default, new Microsoft.Azure.Batch.Protocol.Models.BatchErrorException() { Body = new() { Code = "ActiveJobAndScheduleQuotaReached", Message = new(value: "No job for you.") } }))); - void Validator(TesTask task, IEnumerable<(LogLevel logLevel, Exception exception)> logs) + void Validator(TesTask task, IEnumerable<(LogLevel logLevel, Exception, string)> logs) { GuardAssertsWithTesTask(task, () => { @@ -539,7 +655,7 @@ void Validator(TesTask task, IEnumerable<(LogLevel logLevel, Exception exception [TestMethod] public Task AddBatchTaskHandlesBatchExceptionForPoolQuota() { - return AddBatchTaskHandlesExceptions(TesState.QUEUEDEnum, Arranger, Validator); + return AddBatchTasksHandlesExceptions(TesState.QUEUEDEnum, Arranger, Validator); (Action, Action>) Arranger(AzureProxyReturnValues _1) => (default, azureProxy => azureProxy.Setup(b => b.CreateBatchPoolAsync(It.IsAny(), It.IsAny(), It.IsAny())) @@ -549,7 +665,7 @@ public Task AddBatchTaskHandlesBatchExceptionForPoolQuota() default, new Microsoft.Azure.Batch.Protocol.Models.BatchErrorException() { Body = new() { Code = "PoolQuotaReached", Message = new(value: "No pool for you.") } }))); - void Validator(TesTask task, IEnumerable<(LogLevel logLevel, Exception exception)> logs) + void Validator(TesTask task, IEnumerable<(LogLevel logLevel, Exception, string)> logs) { GuardAssertsWithTesTask(task, () => { @@ -564,14 +680,14 @@ void Validator(TesTask task, IEnumerable<(LogLevel logLevel, Exception exception [TestMethod] public Task AddBatchTaskHandlesCloudExceptionForPoolQuota() { - return AddBatchTaskHandlesExceptions(TesState.QUEUEDEnum, Arranger, Validator); + return AddBatchTasksHandlesExceptions(TesState.QUEUEDEnum, Arranger, Validator); (Action, Action>) Arranger(AzureProxyReturnValues _1) => (default, azureProxy => azureProxy.Setup(b => b.CreateBatchPoolAsync(It.IsAny(), It.IsAny(), It.IsAny())) .Callback((poolInfo, isPreemptible, cancellationToken) => throw new 
Microsoft.Rest.Azure.CloudException() { Body = new() { Code = "AutoPoolCreationFailedWithQuotaReached", Message = "No autopool for you." } })); - void Validator(TesTask task, IEnumerable<(LogLevel logLevel, Exception exception)> logs) + void Validator(TesTask task, IEnumerable<(LogLevel logLevel, Exception, string)> logs) { GuardAssertsWithTesTask(task, () => { @@ -589,18 +705,18 @@ public Task AddBatchTaskHandlesUnknownException() var exceptionMsg = "Successful Test"; var batchQuotaProvider = new Mock(); batchQuotaProvider.Setup(p => p.GetVmCoreQuotaAsync(It.IsAny(), It.IsAny())).Callback((lowPriority, _1) => throw new InvalidOperationException(exceptionMsg)); - return AddBatchTaskHandlesExceptions(TesState.SYSTEMERROREnum, Arranger, Validator); + return AddBatchTasksHandlesExceptions(TesState.SYSTEMERROREnum, Arranger, Validator); (Action, Action>) Arranger(AzureProxyReturnValues _1) => (services => services.AddTransient(p => batchQuotaProvider.Object), default); - void Validator(TesTask tesTask, IEnumerable<(LogLevel logLevel, Exception exception)> logs) + void Validator(TesTask tesTask, IEnumerable<(LogLevel, Exception, string)> logs) { GuardAssertsWithTesTask(tesTask, () => { var log = logs.LastOrDefault(); Assert.IsNotNull(log); - var (logLevel, exception) = log; + var (logLevel, exception, _) = log; Assert.AreEqual(LogLevel.Error, logLevel); Assert.IsInstanceOfType(exception); Assert.AreEqual(exceptionMsg, exception.Message); @@ -1557,7 +1673,10 @@ public async Task PoolIsCreatedWithoutPublicIpWhenSubnetAndDisableBatchNodesPubl private static Task<(string JobId, CloudTask CloudTask, PoolInformation PoolInformation, Pool batchModelsPool)> ProcessTesTaskAndGetBatchJobArgumentsAsync() => ProcessTesTaskAndGetBatchJobArgumentsAsync(GetTesTask(), GetMockConfig()(), GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); - private static async Task<(string JobId, CloudTask CloudTask, PoolInformation PoolInformation, Pool 
batchModelsPool)> ProcessTesTaskAndGetBatchJobArgumentsAsync(TesTask tesTask, IEnumerable<(string Key, string Value)> configuration, Action> azureProxy, AzureProxyReturnValues azureProxyReturnValues, Action additionalActions = default, Action> serviceProviderActions = default) + private static Task<(string JobId, CloudTask CloudTask, PoolInformation PoolInformation, Pool batchModelsPool)> ProcessTesTaskAndGetBatchJobArgumentsAsync(TesTask tesTask, IEnumerable<(string Key, string Value)> configuration, Action> azureProxy, AzureProxyReturnValues azureProxyReturnValues, Action additionalActions = default, Action> serviceProviderActions = default) + => ProcessTesTasksAndGetBatchJobArgumentsAsync(new[] { tesTask }, configuration, azureProxy, azureProxyReturnValues, additionalActions, serviceProviderActions); + + private static async Task<(string JobId, CloudTask CloudTask, PoolInformation PoolInformation, Pool batchModelsPool)> ProcessTesTasksAndGetBatchJobArgumentsAsync(TesTask[] tesTasks, IEnumerable<(string Key, string Value)> configuration, Action> azureProxy, AzureProxyReturnValues azureProxyReturnValues, Action additionalActions = default, Action> serviceProviderActions = default) { using var serviceProvider = GetServiceProvider( configuration, @@ -1571,11 +1690,11 @@ public async Task PoolIsCreatedWithoutPublicIpWhenSubnetAndDisableBatchNodesPubl if (azureProxyReturnValues.BatchTaskState is null) { - await foreach (var _ in batchScheduler.ProcessQueuedTesTasksAsync(new[] { tesTask }, CancellationToken.None)) { } + await foreach (var _ in batchScheduler.ProcessQueuedTesTasksAsync(tesTasks, CancellationToken.None)) { } } else { - await foreach (var _ in batchScheduler.ProcessTesTaskBatchStatesAsync(new[] { tesTask }, new[] { azureProxyReturnValues.BatchTaskState }, CancellationToken.None)) { } + await foreach (var _ in batchScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, Enumerable.Repeat(azureProxyReturnValues.BatchTaskState, tesTasks.Length).ToArray(), 
CancellationToken.None)) { } } var createBatchPoolAsyncInvocation = serviceProvider.AzureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IAzureProxy.CreateBatchPoolAsync)); @@ -1948,6 +2067,28 @@ internal CloudPool GetBatchPoolImpl(string poolId) } } + private class TestMultitaskBatchQuotaVerifierQuotaMaxedOut : TestBatchQuotaVerifierBase + { + public TestMultitaskBatchQuotaVerifierQuotaMaxedOut(IBatchQuotaProvider batchQuotaProvider) : base(batchQuotaProvider) { } + + public override Task CheckBatchAccountPoolAndJobQuotasAsync(int required, CancellationToken cancellationToken) + => Task.FromResult(new CheckGroupPoolAndJobQuotaResult(required / 2, new AzureBatchQuotaMaxedOutException("Test AzureBatchQuotaMaxedOutException"))); + + public override Task CheckBatchAccountQuotasAsync(VirtualMachineInformation _1, bool _2, CancellationToken cancellationToken) + => Task.CompletedTask; + } + + private class TestMultitaskBatchQuotaVerifierQuotaAllAllowed : TestBatchQuotaVerifierBase + { + public TestMultitaskBatchQuotaVerifierQuotaAllAllowed(IBatchQuotaProvider batchQuotaProvider) : base(batchQuotaProvider) { } + + public override Task CheckBatchAccountPoolAndJobQuotasAsync(int required, CancellationToken cancellationToken) + => Task.FromResult(new CheckGroupPoolAndJobQuotaResult(0, null)); + + public override Task CheckBatchAccountQuotasAsync(VirtualMachineInformation _1, bool _2, CancellationToken cancellationToken) + => Task.CompletedTask; + } + private class TestBatchQuotaVerifierQuotaMaxedOut : TestBatchQuotaVerifierBase { public TestBatchQuotaVerifierQuotaMaxedOut(IBatchQuotaProvider batchQuotaProvider) : base(batchQuotaProvider) { } diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 441bbe2ca..db5c22fa8 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -627,29 +627,40 @@ await Parallel.ForEachAsync(tesTasks, cancellationToken, async (tesTask, token) results.Clear(); - // 
Determine how many nodes in each new pool we might need for this group. + // Determine how many nodes in each possibly new pool we might need for this group of tasks. var neededPoolNodesByPoolKey = tasksMetadataByPoolKey.ToDictionary(t => t.Key, t => t.Value.Length); { - // Determine how many new pools/jobs we will need for this batch - var requiredNewPools = neededPoolNodesByPoolKey.Where(t => !IsPoolAvailable(t.Key)).Count(); + // Determine how many new pools/jobs we need now + var requiredNewPools = neededPoolNodesByPoolKey.Keys.WhereNot(IsPoolAvailable).ToArray(); - // Revisit pool/job quotas (the above loop already dealt with the possiblility of needing just one more pool/job). + // Revisit pool/job quotas (the above loop already dealt with the possiblility of needing just one more pool or job). // This will remove pool keys we cannot accomodate due to quota, along with all of their associated tasks, from being queued into Batch. - if (requiredNewPools > 1) + if (requiredNewPools.Skip(1).Any()) { - for (var (excess, exception) = await quotaVerifier.CheckBatchAccountPoolAndJobQuotasAsync(requiredNewPools, cancellationToken); - excess > 0;) + bool TryRemoveKeyAndTasks(string key, out (string Key, ImmutableArray ListOfTaskMetadata) result) { - var key = tasksMetadataByPoolKey.Keys.Last(); + result = default; + if (tasksMetadataByPoolKey.TryRemove(key, out var listOfTaskMetadata)) { - foreach (var task in listOfTaskMetadata.Select(m => m.TesTask)) - { - yield return new(HandleExceptionAsync(exception, key, task), task); - } + result = (key, listOfTaskMetadata); + return true; + } + + return false; + } + + var (excess, exception) = await quotaVerifier.CheckBatchAccountPoolAndJobQuotasAsync(requiredNewPools.Length, cancellationToken); - excess--; + foreach (var (key, listOfTaskMetadata) in requiredNewPools + .Reverse() // TODO: do we want to favor earlier or later tasks? 
+ .SelectWhere)>(TryRemoveKeyAndTasks) + .Take(excess)) + { + foreach (var task in listOfTaskMetadata.Select(m => m.TesTask)) + { + yield return new(HandleExceptionAsync(exception, key, task), task); } } } diff --git a/src/TesApi.Web/Extensions/EmumerableExtensions.cs b/src/TesApi.Web/Extensions/EmumerableExtensions.cs new file mode 100644 index 000000000..d548b5e78 --- /dev/null +++ b/src/TesApi.Web/Extensions/EmumerableExtensions.cs @@ -0,0 +1,62 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System; +using System.Collections.Generic; +using System.Linq; + +namespace TesApi.Web.Extensions +{ + /// + /// extensions + /// + public static class EmumerableExtensions + { + /// + /// Filters a sequence of values based on a false predicate. + /// + /// The type of the elements of . + /// An to filter. + /// A function to test each element for a false condition. + /// An that contains elements from that do not satisfy the condition. + /// or is null. + public static IEnumerable WhereNot(this IEnumerable source, Func predicate) + { + ArgumentNullException.ThrowIfNull(predicate); + + return source.Where(element => !predicate(element)); + } + + /// + /// Projects each element of a sequence into a new form for each element where a specified condition is true. + /// + /// The type of the elements of . + /// The type of the value returned by . + /// A sequence of values to test and transform. + /// A function to apply to each element which will test for a condition and provide a transformed value. + /// An whose elements are the result of invoking the transform function on each element that satisfies the condition. 
+ public static IEnumerable SelectWhere(this IEnumerable source, PredicateSelector predicateSelector) + { + ArgumentNullException.ThrowIfNull(source); + ArgumentNullException.ThrowIfNull(predicateSelector); + + foreach (var element in source) + { + if (predicateSelector(element, out var result)) + { + yield return result; + } + } + } + + /// + /// A combined predicate and selector for . + /// + /// The type of . + /// The type of . + /// Source value. + /// Transformed value. + /// True if the source is transformed. + public delegate bool PredicateSelector(TSource source, out TResult result); + } +} From 98e6291d2aba7f23ffc8de84d5fd793ce3df0930 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 15 Dec 2023 18:55:59 -0800 Subject: [PATCH 109/202] formatting --- src/GenerateBatchVmSkus/Program.cs | 2 +- src/Tes.ApiClients/HttpApiClient.cs | 2 +- src/Tes.ApiClients/TerraWsmApiClient.cs | 2 +- src/Tes.Runner/Docker/DockerExecutor.cs | 4 ++-- src/TesApi.Web/Startup.cs | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/GenerateBatchVmSkus/Program.cs b/src/GenerateBatchVmSkus/Program.cs index c4ea8e082..d91253ef1 100644 --- a/src/GenerateBatchVmSkus/Program.cs +++ b/src/GenerateBatchVmSkus/Program.cs @@ -3,7 +3,6 @@ using System.Reflection; using System.Text.Json; -using CommonUtilities.Options; using Azure; using Azure.Core; using Azure.Identity; @@ -12,6 +11,7 @@ using Azure.ResourceManager.Compute; using Azure.ResourceManager.Compute.Models; using Azure.ResourceManager.Resources.Models; +using CommonUtilities.Options; using Microsoft.Extensions.Caching.Memory; using Microsoft.Extensions.Configuration; using Microsoft.Extensions.Logging.Abstractions; diff --git a/src/Tes.ApiClients/HttpApiClient.cs b/src/Tes.ApiClients/HttpApiClient.cs index 34033880e..c478e07c0 100644 --- a/src/Tes.ApiClients/HttpApiClient.cs +++ b/src/Tes.ApiClients/HttpApiClient.cs @@ -5,8 +5,8 @@ using System.Security.Cryptography; using System.Text; using 
System.Text.Json; -using CommonUtilities; using Azure.Core; +using CommonUtilities; using Microsoft.Extensions.Logging; namespace Tes.ApiClients diff --git a/src/Tes.ApiClients/TerraWsmApiClient.cs b/src/Tes.ApiClients/TerraWsmApiClient.cs index db369f5b3..42be2802a 100644 --- a/src/Tes.ApiClients/TerraWsmApiClient.cs +++ b/src/Tes.ApiClients/TerraWsmApiClient.cs @@ -4,8 +4,8 @@ using System.Text; using System.Text.Json; using System.Text.Json.Serialization; -using CommonUtilities.Options; using Azure.Core; +using CommonUtilities.Options; using Microsoft.Extensions.Caching.Memory; using Microsoft.Extensions.Logging; using Tes.ApiClients.Models.Terra; diff --git a/src/Tes.Runner/Docker/DockerExecutor.cs b/src/Tes.Runner/Docker/DockerExecutor.cs index d8f1773e2..e973527e6 100644 --- a/src/Tes.Runner/Docker/DockerExecutor.cs +++ b/src/Tes.Runner/Docker/DockerExecutor.cs @@ -1,12 +1,12 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. +using CommonUtilities; +using CommonUtilities.Options; using Docker.DotNet; using Docker.DotNet.Models; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; -using CommonUtilities; -using CommonUtilities.Options; using Tes.Runner.Authentication; using Tes.Runner.Logs; using Tes.Runner.Models; diff --git a/src/TesApi.Web/Startup.cs b/src/TesApi.Web/Startup.cs index 2219e1096..e07b33533 100644 --- a/src/TesApi.Web/Startup.cs +++ b/src/TesApi.Web/Startup.cs @@ -4,10 +4,10 @@ using System; using System.IO; using System.Reflection; -using CommonUtilities; -using CommonUtilities.Options; using Azure.Core; using Azure.Identity; +using CommonUtilities; +using CommonUtilities.Options; using Microsoft.AspNetCore.Builder; using Microsoft.AspNetCore.Hosting; using Microsoft.Extensions.Caching.Memory; From cf7fa12f51a44174a9df2649541d1aa20d87f887 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 18 Dec 2023 13:07:08 -0800 Subject: [PATCH 110/202] Split builders and implementers --- 
src/CommonUtilities.Tests/Base32Tests.cs | 2 +- .../PagedInterfaceExtensions.cs | 21 +- src/CommonUtilities/RetryHandler.cs | 669 +----------------- src/CommonUtilities/RetryPolicyBuilder.cs | 647 +++++++++++++++++ .../{Base32.cs => UtilityExtensions.cs} | 33 +- src/GenerateBatchVmSkus/Program.cs | 2 +- .../CacheAndRetryHandlerTest.cs | 2 +- .../PriceApiClientTests.cs | 4 +- .../TerraWsmApiClientIntegrationTests.cs | 2 +- .../TerraWsmApiClientTests.cs | 24 +- .../TestServices/RetryHandlersHelpers.cs | 11 +- src/Tes.ApiClients/CachingRetryHandler.cs | 161 +---- .../CachingRetryPolicyBuilder.cs | 158 +++++ src/Tes.ApiClients/HttpApiClient.cs | 8 +- src/Tes.ApiClients/PriceApiClient.cs | 4 +- src/Tes.ApiClients/TerraApiClient.cs | 7 +- src/Tes.ApiClients/TerraWsmApiClient.cs | 14 +- src/Tes.Runner/Docker/DockerExecutor.cs | 10 +- ...riceApiBatchSkuInformationProviderTests.cs | 4 +- .../TestServices/TestServiceProvider.cs | 2 +- src/TesApi.Web/AzureProxy.cs | 12 +- src/TesApi.Web/BatchScheduler.BatchPools.cs | 6 +- .../CachingWithRetriesAzureProxy.cs | 11 +- src/TesApi.Web/Startup.cs | 4 +- src/deploy-tes-on-azure/Deployer.cs | 2 +- 25 files changed, 923 insertions(+), 897 deletions(-) create mode 100644 src/CommonUtilities/RetryPolicyBuilder.cs rename src/CommonUtilities/{Base32.cs => UtilityExtensions.cs} (64%) create mode 100644 src/Tes.ApiClients/CachingRetryPolicyBuilder.cs diff --git a/src/CommonUtilities.Tests/Base32Tests.cs b/src/CommonUtilities.Tests/Base32Tests.cs index d9fcca3b2..6b8dfa652 100644 --- a/src/CommonUtilities.Tests/Base32Tests.cs +++ b/src/CommonUtilities.Tests/Base32Tests.cs @@ -17,6 +17,6 @@ public class Base32Tests [DataRow("fooba", "MZXW6YTB")] [DataRow("foobar", "MZXW6YTBOI======")] public void ValidateConvertToBase32(string data, string expected) - => Assert.AreEqual(expected, Base32.ConvertToBase32(Encoding.UTF8.GetBytes(data))); + => Assert.AreEqual(expected, Encoding.UTF8.GetBytes(data).ConvertToBase32()); } } diff --git 
a/src/CommonUtilities/PagedInterfaceExtensions.cs b/src/CommonUtilities/PagedInterfaceExtensions.cs index 1a2e23b7c..07891f8fb 100644 --- a/src/CommonUtilities/PagedInterfaceExtensions.cs +++ b/src/CommonUtilities/PagedInterfaceExtensions.cs @@ -3,6 +3,7 @@ using Microsoft.Azure.Management.ResourceManager.Fluent.Core; using Microsoft.Rest.Azure; +using static CommonUtilities.RetryHandler; namespace CommonUtilities { @@ -42,13 +43,13 @@ public static IAsyncEnumerable ToAsyncEnumerable(this IPage source, Fun /// Policy retrying call to . /// Name of method originating the retriable operation. /// - public static IAsyncEnumerable ExecuteWithRetryAsync(this RetryHandler.AsyncRetryHandlerPolicy asyncRetryPolicy, Func> func, RetryHandler.RetryHandlerPolicy retryPolicy, [System.Runtime.CompilerServices.CallerMemberName] string? caller = default) // ExecuteWithRetryAsync + public static IAsyncEnumerable ExecuteWithRetryAsync(this AsyncRetryHandlerPolicy asyncRetryPolicy, Func> func, RetryHandlerPolicy retryPolicy, [System.Runtime.CompilerServices.CallerMemberName] string? caller = default) { ArgumentNullException.ThrowIfNull(asyncRetryPolicy); ArgumentNullException.ThrowIfNull(func); ArgumentNullException.ThrowIfNull(retryPolicy); - var ctx = RetryHandler.PrepareContext(caller); + var ctx = PrepareContext(caller); return new PollyAsyncEnumerable(retryPolicy.RetryPolicy.Execute(_ => func(), ctx), asyncRetryPolicy, ctx); } @@ -56,17 +57,17 @@ public static IAsyncEnumerable ExecuteWithRetryAsync(this RetryHandler.Asy /// Adapts calls returning to . /// /// Type of results returned in by . - /// Policy retrying calls made while enumerating results returned by . + /// Policy retrying call to and calls made while enumerating results returned by . /// Method returning . - /// + /// A for controlling the lifetime of the asynchronous operation. /// Name of method originating the retriable operation. 
/// - public static async ValueTask> ExecuteWithRetryAsync(this RetryHandler.AsyncRetryHandlerPolicy asyncRetryPolicy, Func>> func, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string? caller = default) // ExecuteWithRetryAsync + public static async ValueTask> ExecuteWithRetryAsync(this AsyncRetryHandlerPolicy asyncRetryPolicy, Func>> func, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string? caller = default) { ArgumentNullException.ThrowIfNull(asyncRetryPolicy); ArgumentNullException.ThrowIfNull(func); - var ctx = RetryHandler.PrepareContext(caller); + var ctx = PrepareContext(caller); return new PollyAsyncEnumerable(await asyncRetryPolicy.RetryPolicy.ExecuteAsync((_, ct) => func(ct).AsTask(), ctx, cancellationToken), asyncRetryPolicy, ctx); } @@ -74,17 +75,17 @@ public static async ValueTask> ExecuteWithRetryAsync(this /// Adapts calls returning to . /// /// Type of results returned in by . - /// Policy retrying calls made while enumerating results returned by . + /// Policy retrying call to and calls made while enumerating results returned by . /// Method returning . - /// + /// A for controlling the lifetime of the asynchronous operation. /// Name of method originating the retriable operation. /// - public static async Task> ExecuteWithRetryAsync(this RetryHandler.AsyncRetryHandlerPolicy asyncRetryPolicy, Func>> func, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string? caller = default) // ExecuteWithRetryAsync + public static async Task> ExecuteWithRetryAsync(this AsyncRetryHandlerPolicy asyncRetryPolicy, Func>> func, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string? 
caller = default) { ArgumentNullException.ThrowIfNull(asyncRetryPolicy); ArgumentNullException.ThrowIfNull(func); - var ctx = RetryHandler.PrepareContext(caller); + var ctx = PrepareContext(caller); return new PollyAsyncEnumerable(await asyncRetryPolicy.RetryPolicy.ExecuteAsync((_, ct) => func(ct), ctx, cancellationToken), asyncRetryPolicy, ctx); } diff --git a/src/CommonUtilities/RetryHandler.cs b/src/CommonUtilities/RetryHandler.cs index 482a17c6e..da741fbd2 100644 --- a/src/CommonUtilities/RetryHandler.cs +++ b/src/CommonUtilities/RetryHandler.cs @@ -1,19 +1,14 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. -using System.Net; -using CommonUtilities.Options; -using Microsoft.Extensions.Logging; -using Microsoft.Extensions.Options; using Polly; -using Polly.Retry; namespace CommonUtilities; /// /// Utility class that facilitates the retry policy implementations for HTTP clients. /// -public class RetryHandler +public static class RetryHandler { /// /// Polly Context key for caller method name @@ -24,52 +19,13 @@ public class RetryHandler /// public const string BackupSkipProvidedIncrementKey = $"Tes.ApiClients.{nameof(RetryHandler)}.BackupSkipProvidedIncrementCount"; - /// - /// The main starting point for building retry policies - /// - public IPolicyBuilderPolicy PolicyBuilder => new PolicyBuilderPolicy(defaultOptions); - - /// - /// The default HttpResponseMessage retry policy builder. - /// - public static PolicyBuilder DefaultHttpResponseMessagePolicyBuilder => - Polly.Extensions.Http.HttpPolicyExtensions.HandleTransientHttpError() - .OrResult(r => r.StatusCode == HttpStatusCode.TooManyRequests); - - /// Shortcut starting point for testing. Can be used in production as well. - public virtual IPolicyBuilderWait DefaultRetryPolicyBuilder() - => PolicyBuilder - .OpinionatedRetryPolicy() - .WithRetryPolicyOptionsWait(); - - /// Shortcut starting point for testing. Can be used in production as well. 
- public virtual IPolicyBuilderWait DefaultRetryHttpResponseMessagePolicyBuilder() - => PolicyBuilder - .OpinionatedRetryPolicy(DefaultHttpResponseMessagePolicyBuilder) - .WithRetryPolicyOptionsWait(); - - /// - /// Public constructor - /// - /// Retry policy options - public RetryHandler(IOptions retryPolicyOptions) - { - ArgumentNullException.ThrowIfNull(retryPolicyOptions); - defaultOptions = new(this, retryPolicyOptions.Value); - } - - /// - /// Parameter-less constructor for mocking - /// - protected RetryHandler() { } - #region RetryHandlerPolicies /// /// Non-generic synchronous retry policy /// public class RetryHandlerPolicy { - private readonly ISyncPolicy retryPolicy; + private readonly ISyncPolicy retryPolicy = null!; /// For extensions public ISyncPolicy RetryPolicy => retryPolicy; @@ -93,7 +49,7 @@ public RetryHandlerPolicy() { } /// /// Action to execute. /// Name of method originating the retriable operation. - public virtual void ExecuteWithRetry(Action action, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + public virtual void ExecuteWithRetry(Action action, [System.Runtime.CompilerServices.CallerMemberName] string? caller = default) { ArgumentNullException.ThrowIfNull(action); @@ -107,7 +63,7 @@ public virtual void ExecuteWithRetry(Action action, [System.Runtime.CompilerServ /// Action to execute. /// Name of method originating the retriable operation. /// instance. - public virtual TResult ExecuteWithRetry(Func action, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + public virtual TResult ExecuteWithRetry(Func action, [System.Runtime.CompilerServices.CallerMemberName] string? 
caller = default) { ArgumentNullException.ThrowIfNull(action); @@ -126,7 +82,7 @@ public virtual TResult ExecuteWithRetry(Func action, [System.R /// public class AsyncRetryHandlerPolicy { - private readonly IAsyncPolicy retryPolicy; + private readonly IAsyncPolicy retryPolicy = null!; /// For extensions public IAsyncPolicy RetryPolicy => retryPolicy; @@ -151,7 +107,7 @@ public AsyncRetryHandlerPolicy() { } /// Action to execute. /// Name of method originating the retriable operation. /// - public virtual Task ExecuteWithRetryAsync(Func action, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + public virtual Task ExecuteWithRetryAsync(Func action, [System.Runtime.CompilerServices.CallerMemberName] string? caller = default) { ArgumentNullException.ThrowIfNull(action); @@ -165,7 +121,7 @@ public virtual Task ExecuteWithRetryAsync(Func action, [System.Runtime.Com /// A for controlling the lifetime of the asynchronous operation. /// Name of method originating the retriable operation. /// - public virtual Task ExecuteWithRetryAsync(Func action, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + public virtual Task ExecuteWithRetryAsync(Func action, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string? caller = default) { ArgumentNullException.ThrowIfNull(action); @@ -179,7 +135,7 @@ public virtual Task ExecuteWithRetryAsync(Func action, /// Action to execute. /// Name of method originating the retriable operation. /// instance. - public virtual Task ExecuteWithRetryAsync(Func> action, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + public virtual Task ExecuteWithRetryAsync(Func> action, [System.Runtime.CompilerServices.CallerMemberName] string? 
caller = default) { ArgumentNullException.ThrowIfNull(action); @@ -194,7 +150,7 @@ public virtual Task ExecuteWithRetryAsync(Func> /// A for controlling the lifetime of the asynchronous operation. /// Name of method originating the retriable operation. /// instance. - public virtual Task ExecuteWithRetryAsync(Func> action, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + public virtual Task ExecuteWithRetryAsync(Func> action, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string? caller = default) { ArgumentNullException.ThrowIfNull(action); @@ -207,7 +163,7 @@ public virtual Task ExecuteWithRetryAsync(Func public class AsyncRetryHandlerPolicy { - private readonly IAsyncPolicy retryPolicy; + private readonly IAsyncPolicy retryPolicy = null!; /// For extensions public IAsyncPolicy RetryPolicy => retryPolicy; @@ -232,7 +188,7 @@ public AsyncRetryHandlerPolicy() { } /// Action to execute /// Name of method originating the retriable operation. /// - public virtual Task ExecuteWithRetryAsync(Func> action, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + public virtual Task ExecuteWithRetryAsync(Func> action, [System.Runtime.CompilerServices.CallerMemberName] string? caller = default) { ArgumentNullException.ThrowIfNull(action); @@ -246,7 +202,7 @@ public virtual Task ExecuteWithRetryAsync(Func> action, [ /// A for controlling the lifetime of the asynchronous operation. /// Name of method originating the retriable operation. /// - public virtual Task ExecuteWithRetryAsync(Func> action, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + public virtual Task ExecuteWithRetryAsync(Func> action, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string? 
caller = default) { ArgumentNullException.ThrowIfNull(action); @@ -262,7 +218,7 @@ public virtual Task ExecuteWithRetryAsync(FuncA for controlling the lifetime of the asynchronous operation. /// Name of method originating the retriable operation. /// - public virtual async Task ExecuteWithRetryAndConversionAsync(Func> action, Func> convert, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + public virtual async Task ExecuteWithRetryAndConversionAsync(Func> action, Func> convert, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string? caller = default) { ArgumentNullException.ThrowIfNull(action); @@ -271,230 +227,6 @@ public virtual async Task ExecuteWithRetryAndConversionAsync(Func - /// Selects retry policy type. - /// - public interface IPolicyBuilderPolicy - { - /// - /// Default retry policy. - /// - /// retry policy builder. - IPolicyBuilderBase OpinionatedRetryPolicy(); - - /// - /// Custom retry policy. - /// - /// Builder class that holds the list of current exception predicates. - /// Custom retry policy builder. - IPolicyBuilderBase OpinionatedRetryPolicy(PolicyBuilder policyBuilder); - - /// - /// Generic retry policy. - /// - /// Result values. - /// Builder class that holds the list of current execution predicates filtering TResult result values. - /// Generic retry policy builder. - IPolicyBuilderBase OpinionatedRetryPolicy(PolicyBuilder policyBuilder); - } - - /// - /// Selects retry policy wait algorithm. - /// - public interface IPolicyBuilderBase - { - /// - /// Default wait policy. - /// - /// OnRetry hander - IPolicyBuilderWait WithRetryPolicyOptionsWait(); - - /// - /// Custom exponential wait policy. - /// - /// Maximum number of retries. - /// Value in seconds which is raised by the power of the retry attempt. 
- /// OnRetry hander - IPolicyBuilderWait WithExponentialBackoffWait(int maxRetryCount, double exponentialBackOffExponent); - - /// - /// Custom exception-based wait policy. - /// - /// Maximum number of retries. - /// Wait policy. - /// OnRetry hander - IPolicyBuilderWait WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func waitDurationProvider); - - /// - /// Custom optional exception-based wait policy backed up by an exponential wait policy. - /// - /// Wait policy that can return to use the backup wait policy. - /// Maximum number of retries. - /// Value in seconds which is raised by the power of the backup retry attempt. - /// True to pass backup wait provider its own attempt values, False to provide overall attemp values. - /// OnRetry hander - IPolicyBuilderWait WithExceptionBasedWaitWithExponentialBackoffBackup(Func waitDurationProvider, int maxRetryCount, double exponentialBackOffExponent, bool backupSkipProvidedIncrements); - - /// - /// Custom optional exception-based wait policy backed up by the default wait policy. - /// - /// Wait policy that can return to use the backup wait policy. - /// True to pass backup wait provider its own attempt values, False to provide overall attemp values. - /// OnRetry hander - IPolicyBuilderWait WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func waitDurationProvider, bool backupSkipProvidedIncrements); - } - - /// - /// Selects retry policy wait algorithm. - /// - public interface IPolicyBuilderBase - { - /// - /// Default wait policy. - /// - /// OnRetry hander - IPolicyBuilderWait WithRetryPolicyOptionsWait(); - - /// - /// Custom exponential wait policy. - /// - /// Maximum number of retries. - /// Value in seconds which is raised by the power of the retry attempt. - /// OnRetry hander - IPolicyBuilderWait WithExponentialBackoffWait(int maxRetryCount, double exponentialBackOffExponent); - - /// - /// Custom result-based policy. - /// - /// Maximum number of retries. - /// Wait policy. - /// Wait policy. 
- /// OnRetry hander - IPolicyBuilderWait WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func, TimeSpan> waitDurationProvider); - - /// - /// Custom exception-based wait policy. - /// - /// Maximum number of retries. - /// Wait policy. - /// OnRetry hander - IPolicyBuilderWait WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func waitDurationProvider); - - /// - /// Custom optional exception-based wait policy backed up by an exponential wait policy. - /// - /// Wait policy that can return to use the backup wait policy. - /// Maximum number of retries. - /// Value in seconds which is raised by the power of the backup retry attempt. - /// True to pass backup wait provider its own attempt values, False to provide overall attemp values. - /// OnRetry hander - IPolicyBuilderWait WithExceptionBasedWaitWithExponentialBackoffBackup(Func waitDurationProvider, int retryCount, double exponentialBackOffExponent, bool backupSkipProvidedIncrements); - - /// - /// Custom optional exception-based wait policy backed up by the default wait policy. - /// - /// Wait policy that can return to use the backup wait policy. - /// True to pass backup wait provider its own attempt values, False to provide overall attemp values. - /// OnRetry hander - IPolicyBuilderWait WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func waitDurationProvider, bool backupSkipProvidedIncrements); - } - - /// - /// Sets on-retry handlers. - /// - public interface IPolicyBuilderWait - { - /// - /// OnRetry behaviors - /// - /// Logger to enable retry logging. - /// Custom onretry handler. - /// Custom async onretry handler. Only applies to . - /// OnRetry builder - IPolicyBuilderBuild SetOnRetryBehavior(ILogger logger = default, OnRetryHandler onRetry = default, OnRetryHandlerAsync onRetryAsync = default); - } - - /// - /// Sets on-retry handlers. - /// - public interface IPolicyBuilderWait - { - /// - /// OnRetry behaviors - /// - /// Logger to enable retry logging. 
- /// Custom onretry handler. - /// Custom async onretry handler. Only applies to . - /// OnRetry builder - IPolicyBuilderBuild SetOnRetryBehavior(ILogger logger = default, OnRetryHandler onRetry = default, OnRetryHandlerAsync onRetryAsync = default); - } - - public interface IPolicyBuilderBuild - { - /// - /// Builds . - /// - /// Retry policy. - RetryHandlerPolicy SyncBuild(); - - /// - /// Builds for extensions to the builder. - /// - /// Retry policy. - ISyncPolicy SyncBuildPolicy(); - - /// - /// Builds . - /// - /// Async retry policy. - AsyncRetryHandlerPolicy AsyncBuild(); - - /// - /// Builds for extensions to the builder. - /// - /// Async retry policy. - IAsyncPolicy AsyncBuildPolicy(); - - /// - /// Retrives the instance of the retryhandler to accomodate extensions to the builder - /// - RetryHandler PolicyBuilderBase { get; } - } - - public interface IPolicyBuilderBuild - { - ///// - ///// Builds . - ///// - ///// Retry policy. - //RetryHandlerPolicy SyncBuild(); - - ///// - ///// Builds for extensions to the builder. - ///// - ///// Retry policy. - //ISyncPolicy SyncBuildPolicy(); - - /// - /// Builds . - /// - /// Async retry policy. - AsyncRetryHandlerPolicy AsyncBuild(); - - /// - /// Builds for extensions to the builder. - /// - /// Async retry policy. - IAsyncPolicy AsyncBuildPolicy(); - - /// - /// Retrives the instance of the retryhandler to accomodate extensions to the builder - /// - RetryHandler PolicyBuilderBase { get; } - } - #endregion - #region Delegates for custom on-retry handlers /// /// The action to call on each retry. @@ -505,7 +237,7 @@ public interface IPolicyBuilderBuild /// A Guid guaranteed to be unique to each execution. Acts as a correlation id so that events specific to a single execution can be identified in logging and telemetry. /// Name of method originating the retriable operation. /// This is called right before the wait. 
- public delegate void OnRetryHandler(Exception outcome, TimeSpan timespan, int retryCount, Guid correlationId, string caller); + public delegate void OnRetryHandler(Exception outcome, TimeSpan timespan, int retryCount, Guid correlationId, string? caller); /// /// The action to call on each retry. @@ -516,7 +248,7 @@ public interface IPolicyBuilderBuild /// A Guid guaranteed to be unique to each execution. Acts as a correlation id so that events specific to a single execution can be identified in logging and telemetry. /// Name of method originating the retriable operation. /// This is called right before the wait. - public delegate Task OnRetryHandlerAsync(Exception outcome, TimeSpan timespan, int retryCount, Guid correlationId, string caller); + public delegate Task OnRetryHandlerAsync(Exception outcome, TimeSpan timespan, int retryCount, Guid correlationId, string? caller); /// /// The action to call on each retry. @@ -528,7 +260,7 @@ public interface IPolicyBuilderBuild /// A Guid guaranteed to be unique to each execution. Acts as a correlation id so that events specific to a single execution can be identified in logging and telemetry. /// Name of method originating the retriable operation. /// This is called right before the wait. - public delegate void OnRetryHandler(DelegateResult result, TimeSpan timespan, int retryCount, Guid correlationId, string caller); + public delegate void OnRetryHandler(DelegateResult result, TimeSpan timespan, int retryCount, Guid correlationId, string? caller); /// /// The action to call on each retry. @@ -540,374 +272,7 @@ public interface IPolicyBuilderBuild /// A Guid guaranteed to be unique to each execution. Acts as a correlation id so that events specific to a single execution can be identified in logging and telemetry. /// Name of method originating the retriable operation. /// This is called right before the wait. 
- public delegate Task OnRetryHandlerAsync(DelegateResult result, TimeSpan timespan, int retryCount, Guid correlationId, string caller); - #endregion - - #region Builder interface implementations - private readonly Defaults defaultOptions; - - private readonly struct Defaults - { - public readonly RetryPolicyOptions PolicyOptions; - public readonly RetryHandler PolicyBuilderBase; - - public Defaults(RetryHandler retryHandler, RetryPolicyOptions options) - { - ArgumentNullException.ThrowIfNull(options); - PolicyOptions = options; - PolicyBuilderBase = retryHandler; - } - } - - private readonly struct PolicyBuilderPolicy : IPolicyBuilderPolicy - { - private readonly Defaults Defaults; - - public PolicyBuilderPolicy(Defaults options) - { - ArgumentNullException.ThrowIfNull(options); - Defaults = options; - } - - /// - IPolicyBuilderBase IPolicyBuilderPolicy.OpinionatedRetryPolicy() - => new PolicyBuilderBase(Policy.Handle(), Defaults); - - /// - IPolicyBuilderBase IPolicyBuilderPolicy.OpinionatedRetryPolicy(PolicyBuilder policy) - => new PolicyBuilderBase(policy, Defaults); - - /// - IPolicyBuilderBase IPolicyBuilderPolicy.OpinionatedRetryPolicy(PolicyBuilder policy) - => new PolicyBuilderBase(policy, Defaults); - - private readonly struct PolicyBuilderBase : IPolicyBuilderBase - { - public readonly PolicyBuilder policyBuilder; - public readonly Defaults Defaults; - - public PolicyBuilderBase(PolicyBuilder policyBuilder, Defaults defaults) - { - ArgumentNullException.ThrowIfNull(policyBuilder); - ArgumentNullException.ThrowIfNull(defaults); - this.policyBuilder = policyBuilder; - Defaults = defaults; - } - - public static Func DefaultSleepDurationProvider(Defaults defaults) - => ExponentialSleepDurationProvider(defaults.PolicyOptions.ExponentialBackOffExponent); - - public static Func ExponentialSleepDurationProvider(double exponentialBackOffExponent) - => (attempt, _1, _2) => TimeSpan.FromSeconds(Math.Pow(exponentialBackOffExponent, attempt)); - - public static Func 
ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(Func sleepDurationProvider, double exponentialBackOffExponent, bool backupSkipProvidedIncrements) - => (attempt, exception, ctx) => - { - return backupSkipProvidedIncrements - ? AdjustAttemptIfNeeded() - : sleepDurationProvider(attempt, exception) ?? ExponentialSleepDurationProvider(exponentialBackOffExponent)(attempt, exception, ctx); - - TimeSpan AdjustAttemptIfNeeded() - { - if (!ctx.TryGetValue(BackupSkipProvidedIncrementKey, out var value) || value is not int || attempt < 2) - { - ctx[BackupSkipProvidedIncrementKey] = value = 0; - } - - var result = sleepDurationProvider(attempt, exception); - - if (result is null) - { - var skipIncrement = (int)value; - attempt -= skipIncrement; - ctx[BackupSkipProvidedIncrementKey] = ++skipIncrement; - result = ExponentialSleepDurationProvider(exponentialBackOffExponent)(attempt, exception, ctx); - } - - return result.Value; - } - }; - - - /// - IPolicyBuilderWait IPolicyBuilderBase.WithRetryPolicyOptionsWait() - => new PolicyBuilderWait(this, Defaults.PolicyOptions.MaxRetryCount, DefaultSleepDurationProvider(Defaults)); - - /// - IPolicyBuilderWait IPolicyBuilderBase.WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func sleepDurationProvider) - => new PolicyBuilderWait(this, maxRetryCount, (attempt, outcome, _1) => sleepDurationProvider(attempt, outcome)); - - /// - IPolicyBuilderWait IPolicyBuilderBase.WithExponentialBackoffWait(int retryCount, double exponentialBackOffExponent) - => new PolicyBuilderWait(this, retryCount, ExponentialSleepDurationProvider(exponentialBackOffExponent)); - - /// - IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func sleepDurationProvider, bool backupSkipProvidedIncrements) - => new PolicyBuilderWait(this, Defaults.PolicyOptions.MaxRetryCount, ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, Defaults.PolicyOptions.ExponentialBackOffExponent, 
backupSkipProvidedIncrements)); - - /// - IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithExponentialBackoffBackup(Func sleepDurationProvider, int retryCount, double exponentialBackOffExponent, bool backupSkipProvidedIncrements) - => new PolicyBuilderWait(this, retryCount, ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, exponentialBackOffExponent, backupSkipProvidedIncrements)); - } - - private readonly struct PolicyBuilderBase : IPolicyBuilderBase - { - public readonly PolicyBuilder policyBuilder; - public readonly Defaults Defaults; - - public PolicyBuilderBase(PolicyBuilder policyBuilder, Defaults defaults) - { - ArgumentNullException.ThrowIfNull(policyBuilder); - ArgumentNullException.ThrowIfNull(defaults); - this.policyBuilder = policyBuilder; - Defaults = defaults; - } - - /// - IPolicyBuilderWait IPolicyBuilderBase.WithRetryPolicyOptionsWait() - => new PolicyBuilderWait(this, Defaults.PolicyOptions.MaxRetryCount, default, PolicyBuilderBase.DefaultSleepDurationProvider(Defaults)); - - /// - IPolicyBuilderWait IPolicyBuilderBase.WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func waitDurationProvider) - => new PolicyBuilderWait(this, maxRetryCount, default, (attempt, outcome, _1) => waitDurationProvider(attempt, outcome)); - - /// - IPolicyBuilderWait IPolicyBuilderBase.WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func, TimeSpan> sleepDurationProvider) - => new PolicyBuilderWait(this, maxRetryCount, (attempt, outcome, _1) => sleepDurationProvider(attempt, outcome), default); - - /// - IPolicyBuilderWait IPolicyBuilderBase.WithExponentialBackoffWait(int maxRetryCount, double exponentialBackOffExponent) - => new PolicyBuilderWait(this, maxRetryCount, default, PolicyBuilderBase.ExponentialSleepDurationProvider(exponentialBackOffExponent)); - - /// - IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func sleepDurationProvider, bool 
backupSkipProvidedIncrements) - => new PolicyBuilderWait(this, Defaults.PolicyOptions.MaxRetryCount, default, PolicyBuilderBase.ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, Defaults.PolicyOptions.ExponentialBackOffExponent, backupSkipProvidedIncrements)); - - /// - IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithExponentialBackoffBackup(Func sleepDurationProvider, int retryCount, double exponentialBackOffExponent, bool backupSkipProvidedIncrements) - => new PolicyBuilderWait(this, retryCount, default, PolicyBuilderBase.ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, exponentialBackOffExponent, backupSkipProvidedIncrements)); - } - - private readonly struct PolicyBuilderWait : IPolicyBuilderWait - { - public readonly PolicyBuilderBase policyBuilder; - public readonly Func sleepDurationProvider; - public readonly int maxRetryCount; - - public PolicyBuilderWait(PolicyBuilderBase policyBuilder, int maxRetryCount, Func sleepDurationProvider) - { - ArgumentNullException.ThrowIfNull(sleepDurationProvider); - this.policyBuilder = policyBuilder; - this.maxRetryCount = maxRetryCount; - this.sleepDurationProvider = sleepDurationProvider; - } - - /// - IPolicyBuilderBuild IPolicyBuilderWait.SetOnRetryBehavior(ILogger logger, OnRetryHandler onRetry, OnRetryHandlerAsync onRetryAsync) - => new PolicyBuilderBuild(this, sleepDurationProvider, logger, onRetry, onRetryAsync); - } - - private readonly struct PolicyBuilderWait : IPolicyBuilderWait - { - public readonly PolicyBuilderBase policyBuilder; - public readonly Func sleepDurationProvider; - public readonly Func, Context, TimeSpan> genericSleepDurationProvider; - public readonly int maxRetryCount; - - private static Func, Context, TimeSpan> PickSleepDurationProvider(Func, Context, TimeSpan> tResultProvider, Func exceptionProvider) - => tResultProvider is null ? 
(attempt, outcome, ctx) => exceptionProvider(attempt, outcome.Exception, ctx) : tResultProvider; - - public PolicyBuilderWait(PolicyBuilderBase policyBuilder, int maxRetryCount, Func, Context, TimeSpan> sleepDurationProviderResult, Func sleepDurationProviderException) - { - if (sleepDurationProviderException is null && sleepDurationProviderResult is null) - { - throw new ArgumentNullException(null, $"At least one of {nameof(sleepDurationProviderResult)} or {nameof(sleepDurationProviderException)} must be provided."); - } - - this.policyBuilder = policyBuilder; - this.maxRetryCount = maxRetryCount; - this.sleepDurationProvider = sleepDurationProviderException; - this.genericSleepDurationProvider = sleepDurationProviderResult; - } - - /// - IPolicyBuilderBuild IPolicyBuilderWait.SetOnRetryBehavior(ILogger logger, OnRetryHandler onRetry, OnRetryHandlerAsync onRetryAsync) - => new PolicyBuilderBuild(this, PickSleepDurationProvider(genericSleepDurationProvider, sleepDurationProvider), logger, onRetry, onRetryAsync); - } - - private readonly struct PolicyBuilderBuild : IPolicyBuilderBuild - { - private readonly PolicyBuilderWait policyBuilder; - private readonly Func sleepDurationProvider; - private readonly ILogger logger; - private readonly OnRetryHandler onRetryHandler; - private readonly OnRetryHandlerAsync onRetryHandlerAsync; - - /// - public RetryHandler PolicyBuilderBase { get; } - - public PolicyBuilderBuild(PolicyBuilderWait policyBuilder, Func sleepDurationProvider, ILogger logger, OnRetryHandler onRetry, OnRetryHandlerAsync onRetryAsync) - { - ArgumentNullException.ThrowIfNull(sleepDurationProvider); - this.policyBuilder = policyBuilder; - this.sleepDurationProvider = sleepDurationProvider; - this.logger = logger; - this.onRetryHandler = onRetry; - this.onRetryHandlerAsync = onRetryAsync; - this.PolicyBuilderBase = policyBuilder.policyBuilder.Defaults.PolicyBuilderBase; - } - - public static Action Logger(ILogger logger) - { - return (exception, timeSpan, 
retryCount, ctx) => - logger?.LogError(exception, @"Retrying in {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan:c} CorrelationId: {CorrelationId:D} ErrorMessage: {ExceptionMessage}", ctx[CallerMemberNameKey], retryCount, timeSpan, ctx.CorrelationId, exception.Message); - } - - public static Action OnRetryHandler(ILogger logger, OnRetryHandler onRetryHandler) - { - var handler = onRetryHandler ?? new((exception, timeSpan, retryCount, correlationId, caller) => { }); - - return (exception, timeSpan, retryCount, ctx) => - { - handler(exception, timeSpan, retryCount, ctx.CorrelationId, ctx[CallerMemberNameKey] as string); - Logger(logger)(exception, timeSpan, retryCount, ctx); - }; - } - - public static Func OnRetryHandlerAsync(ILogger logger, OnRetryHandler onRetryHandler, OnRetryHandlerAsync onRetryHandlerAsync) - { - var handler = onRetryHandler ?? new((exception, timeSpan, retryCount, correlationId, caller) => { }); - var handlerAsync = onRetryHandlerAsync ?? new((exception, timeSpan, retryCount, correlationId, caller) => - { - handler(exception, timeSpan, retryCount, correlationId, caller); - return Task.CompletedTask; - }); - - return async (exception, timeSpan, retryCount, ctx) => - { - await handlerAsync(exception, timeSpan, retryCount, ctx.CorrelationId, ctx[CallerMemberNameKey] as string); - Logger(logger)(exception, timeSpan, retryCount, ctx); - }; - } - - /// - ISyncPolicy IPolicyBuilderBuild.SyncBuildPolicy() - { - var waitProvider = sleepDurationProvider; - var onRetryProvider = OnRetryHandler(logger, onRetryHandler); - - return policyBuilder.policyBuilder.policyBuilder.WaitAndRetry(policyBuilder.maxRetryCount, (attempt, ctx) => waitProvider(attempt, default, ctx), onRetryProvider); - } - - /// - IAsyncPolicy IPolicyBuilderBuild.AsyncBuildPolicy() - { - var waitProvider = sleepDurationProvider; - var onRetryProvider = OnRetryHandlerAsync(logger, onRetryHandler, onRetryHandlerAsync); - - return 
policyBuilder.policyBuilder.policyBuilder.WaitAndRetryAsync(policyBuilder.maxRetryCount, waitProvider, onRetryProvider); - } - - /// - RetryHandlerPolicy IPolicyBuilderBuild.SyncBuild() - => new(((IPolicyBuilderBuild)this).SyncBuildPolicy()); - - /// - AsyncRetryHandlerPolicy IPolicyBuilderBuild.AsyncBuild() - => new(((IPolicyBuilderBuild)this).AsyncBuildPolicy()); - } - - private readonly struct PolicyBuilderBuild : IPolicyBuilderBuild - { - private readonly PolicyBuilderWait policyBuilder; - private readonly Func, Context, TimeSpan> sleepDurationProvider; - private readonly ILogger logger; - private readonly OnRetryHandler onRetryHandler; - private readonly OnRetryHandlerAsync onRetryHandlerAsync; - - /// - public RetryHandler PolicyBuilderBase { get; } - - public PolicyBuilderBuild(PolicyBuilderWait policyBuilder, Func, Context, TimeSpan> sleepDurationProvider, ILogger logger, OnRetryHandler onRetry, OnRetryHandlerAsync onRetryAsync) - { - ArgumentNullException.ThrowIfNull(sleepDurationProvider); - this.policyBuilder = policyBuilder; - this.sleepDurationProvider = sleepDurationProvider; - this.logger = logger; - this.onRetryHandler = onRetry; - this.onRetryHandlerAsync = onRetryAsync; - this.PolicyBuilderBase = policyBuilder.policyBuilder.Defaults.PolicyBuilderBase; - } - - private static Action, TimeSpan, int, Context> Logger(ILogger logger) - { - return (outcome, timeSpan, retryCount, ctx) => - { - if (outcome.Exception is null) - { - logger?.LogError(@"Retrying in {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan:c} CorrelationId: {CorrelationId:D}", ctx[CallerMemberNameKey], retryCount, timeSpan, ctx.CorrelationId); - } - else - { - logger?.LogError(outcome.Exception, @"Retrying in {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan:c} CorrelationId: {CorrelationId:D} ErrorMessage: {ExceptionMessage}", ctx[CallerMemberNameKey], retryCount, timeSpan, ctx.CorrelationId, outcome.Exception.Message); - } - }; - } - - //private static Action, TimeSpan, 
int, Context> OnRetryHandler(ILogger logger, OnRetryHandler onRetryHandler) - //{ - // var genericHandler = onRetryHandler ?? new((outcome, timeSpan, retryCount, correlationId, caller) => { }); - - // return (outcome, timeSpan, retryCount, ctx) => - // { - // genericHandler(outcome, timeSpan, retryCount, ctx.CorrelationId, ctx[CallerMemberNameKey] as string); - // Logger(logger)(outcome, timeSpan, retryCount, ctx); - // }; - //} - - private static Func, TimeSpan, int, Context, Task> OnRetryHandlerAsync(ILogger logger, OnRetryHandler onRetryHandler, OnRetryHandlerAsync onRetryHandlerAsync) - { - var handler = onRetryHandler ?? new((outcome, timeSpan, retryCount, correlationId, caller) => { }); - var asyncHandler = onRetryHandlerAsync ?? new((outcome, timespan, retryCount, correlationId, caller) => - { - handler(outcome, timespan, retryCount, correlationId, caller); - return Task.CompletedTask; - }); - - return async (outcome, timespan, retryCount, ctx) => - { - await asyncHandler(outcome, timespan, retryCount, ctx.CorrelationId, ctx[CallerMemberNameKey] as string); - Logger(logger)(outcome, timespan, retryCount, ctx); - }; - } - - ///// - //ISyncPolicy IPolicyBuilderBuild.BuildPolicy() - //{ - // var waitProvider = sleepDurationProvider; - // var onRetryProvider = OnRetryHandler(logger, onRetryHandler); - - // return policyBuilder.policyBuilder.policyBuilder.WaitAndRetry(policyBuilder.retryCount, waitProvider, onRetryProvider); - //} - - /// - IAsyncPolicy IPolicyBuilderBuild.AsyncBuildPolicy() - { - var waitProvider = sleepDurationProvider; - var onRetryProvider = OnRetryHandlerAsync(logger, onRetryHandler, onRetryHandlerAsync); - - return policyBuilder.policyBuilder.policyBuilder.WaitAndRetryAsync(policyBuilder.maxRetryCount, waitProvider, onRetryProvider); - } - - ///// - //RetryHandlerPolicy IPolicyBuilderBuild.Build() - // => new(((IPolicyBuilderBuild)this).BuildPolicy()); - - /// - AsyncRetryHandlerPolicy IPolicyBuilderBuild.AsyncBuild() - => 
new(((IPolicyBuilderBuild)this).AsyncBuildPolicy()); - } - } + public delegate Task OnRetryHandlerAsync(DelegateResult result, TimeSpan timespan, int retryCount, Guid correlationId, string? caller); #endregion public static Context PrepareContext(string? caller) => new() diff --git a/src/CommonUtilities/RetryPolicyBuilder.cs b/src/CommonUtilities/RetryPolicyBuilder.cs new file mode 100644 index 000000000..78934446f --- /dev/null +++ b/src/CommonUtilities/RetryPolicyBuilder.cs @@ -0,0 +1,647 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System.Net; +using CommonUtilities.Options; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; +using Polly; +using Polly.Retry; + +namespace CommonUtilities; + +/// +/// Utility class that facilitates the retry policy implementations for HTTP clients. +/// +public class RetryPolicyBuilder +{ + /// + /// The main starting point for building retry policies + /// + public IPolicyBuilderPolicy PolicyBuilder => new PolicyBuilderPolicy(defaultOptions); + + /// + /// The default HttpResponseMessage retry policy builder. + /// + public static PolicyBuilder DefaultHttpResponseMessagePolicyBuilder => + Polly.Extensions.Http.HttpPolicyExtensions.HandleTransientHttpError() + .OrResult(r => r.StatusCode == HttpStatusCode.TooManyRequests); + + /// Shortcut starting point for testing. Can be used in production as well. + public virtual IPolicyBuilderWait DefaultRetryPolicyBuilder() + => PolicyBuilder + .OpinionatedRetryPolicy() + .WithRetryPolicyOptionsWait(); + + /// Shortcut starting point for testing. Can be used in production as well. 
+ public virtual IPolicyBuilderWait DefaultRetryHttpResponseMessagePolicyBuilder() + => PolicyBuilder + .OpinionatedRetryPolicy(DefaultHttpResponseMessagePolicyBuilder) + .WithRetryPolicyOptionsWait(); + + /// + /// Public constructor + /// + /// Retry policy options + public RetryPolicyBuilder(IOptions retryPolicyOptions) + { + ArgumentNullException.ThrowIfNull(retryPolicyOptions); + defaultOptions = new(this, retryPolicyOptions.Value); + } + + /// + /// Parameter-less constructor for mocking + /// + protected RetryPolicyBuilder() { } + + #region Builder interfaces + /// + /// Selects retry policy type. + /// + public interface IPolicyBuilderPolicy + { + /// + /// Default retry policy. + /// + /// retry policy builder. + IPolicyBuilderBase OpinionatedRetryPolicy(); + + /// + /// Custom retry policy. + /// + /// Builder class that holds the list of current exception predicates. + /// Custom retry policy builder. + IPolicyBuilderBase OpinionatedRetryPolicy(PolicyBuilder policyBuilder); + + /// + /// Generic retry policy. + /// + /// Result values. + /// Builder class that holds the list of current execution predicates filtering TResult result values. + /// Generic retry policy builder. + IPolicyBuilderBase OpinionatedRetryPolicy(PolicyBuilder policyBuilder); + } + + /// + /// Selects retry policy wait algorithm. + /// + public interface IPolicyBuilderBase + { + /// + /// Default wait policy. + /// + /// OnRetry handler + IPolicyBuilderWait WithRetryPolicyOptionsWait(); + + /// + /// Custom exponential wait policy. + /// + /// Maximum number of retries. + /// Value in seconds which is raised by the power of the retry attempt. + /// OnRetry handler + IPolicyBuilderWait WithExponentialBackoffWait(int maxRetryCount, double exponentialBackOffExponent); + + /// + /// Custom exception-based wait policy. + /// + /// Maximum number of retries. + /// Wait policy.
+ /// OnRetry handler + IPolicyBuilderWait WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func waitDurationProvider); + + /// + /// Custom optional exception-based wait policy backed up by an exponential wait policy. + /// + /// Wait policy that can return null to use the backup wait policy. + /// Maximum number of retries. + /// Value in seconds which is raised by the power of the backup retry attempt. + /// True to pass backup wait provider its own attempt values, False to provide overall attempt values. + /// OnRetry handler + IPolicyBuilderWait WithExceptionBasedWaitWithExponentialBackoffBackup(Func waitDurationProvider, int maxRetryCount, double exponentialBackOffExponent, bool backupSkipProvidedIncrements); + + /// + /// Custom optional exception-based wait policy backed up by the default wait policy. + /// + /// Wait policy that can return null to use the backup wait policy. + /// True to pass backup wait provider its own attempt values, False to provide overall attempt values. + /// OnRetry handler + IPolicyBuilderWait WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func waitDurationProvider, bool backupSkipProvidedIncrements); + } + + /// + /// Selects retry policy wait algorithm. + /// + public interface IPolicyBuilderBase + { + /// + /// Default wait policy. + /// + /// OnRetry handler + IPolicyBuilderWait WithRetryPolicyOptionsWait(); + + /// + /// Custom exponential wait policy. + /// + /// Maximum number of retries. + /// Value in seconds which is raised by the power of the retry attempt. + /// OnRetry handler + IPolicyBuilderWait WithExponentialBackoffWait(int maxRetryCount, double exponentialBackOffExponent); + + /// + /// Custom result-based policy. + /// + /// Maximum number of retries. + /// Wait policy. + /// Wait policy. + /// OnRetry handler + IPolicyBuilderWait WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func, TimeSpan> waitDurationProvider); + + /// + /// Custom exception-based wait policy. + /// + /// Maximum number of retries.
+ /// Wait policy. + /// OnRetry handler + IPolicyBuilderWait WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func waitDurationProvider); + + /// + /// Custom optional exception-based wait policy backed up by an exponential wait policy. + /// + /// Wait policy that can return null to use the backup wait policy. + /// Maximum number of retries. + /// Value in seconds which is raised by the power of the backup retry attempt. + /// True to pass backup wait provider its own attempt values, False to provide overall attempt values. + /// OnRetry handler + IPolicyBuilderWait WithExceptionBasedWaitWithExponentialBackoffBackup(Func waitDurationProvider, int retryCount, double exponentialBackOffExponent, bool backupSkipProvidedIncrements); + + /// + /// Custom optional exception-based wait policy backed up by the default wait policy. + /// + /// Wait policy that can return null to use the backup wait policy. + /// True to pass backup wait provider its own attempt values, False to provide overall attempt values. + /// OnRetry handler + IPolicyBuilderWait WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func waitDurationProvider, bool backupSkipProvidedIncrements); + } + + /// + /// Sets on-retry handlers. + /// + public interface IPolicyBuilderWait + { + /// + /// OnRetry behaviors + /// + /// Logger to enable retry logging. + /// Custom onretry handler. + /// Custom async onretry handler. Only applies to AsyncBuild and AsyncBuildPolicy. + /// OnRetry builder + IPolicyBuilderBuild SetOnRetryBehavior(ILogger? logger = default, RetryHandler.OnRetryHandler? onRetry = default, RetryHandler.OnRetryHandlerAsync? onRetryAsync = default); + } + + /// + /// Sets on-retry handlers. + /// + public interface IPolicyBuilderWait + { + /// + /// OnRetry behaviors + /// + /// Logger to enable retry logging. + /// Custom onretry handler. + /// Custom async onretry handler. Only applies to AsyncBuild and AsyncBuildPolicy. + /// OnRetry builder + IPolicyBuilderBuild SetOnRetryBehavior(ILogger? logger = default, RetryHandler.OnRetryHandler?
onRetry = default, RetryHandler.OnRetryHandlerAsync? onRetryAsync = default); + } + + public interface IPolicyBuilderBuild + { + /// + /// Builds RetryHandler.RetryHandlerPolicy. + /// + /// Retry policy. + RetryHandler.RetryHandlerPolicy SyncBuild(); + + /// + /// Builds ISyncPolicy for extensions to the builder. + /// + /// Retry policy. + ISyncPolicy SyncBuildPolicy(); + + /// + /// Builds RetryHandler.AsyncRetryHandlerPolicy. + /// + /// Async retry policy. + RetryHandler.AsyncRetryHandlerPolicy AsyncBuild(); + + /// + /// Builds IAsyncPolicy for extensions to the builder. + /// + /// Async retry policy. + IAsyncPolicy AsyncBuildPolicy(); + + /// + /// Retrieves the instance of the retry policy builder to accommodate extensions to the builder + /// + RetryPolicyBuilder PolicyBuilderBase { get; } + } + + public interface IPolicyBuilderBuild + { + ///// + ///// Builds . + ///// + ///// Retry policy. + //RetryHandlerPolicy SyncBuild(); + + ///// + ///// Builds for extensions to the builder. + ///// + ///// Retry policy. + //ISyncPolicy SyncBuildPolicy(); + + /// + /// Builds RetryHandler.AsyncRetryHandlerPolicy. + /// + /// Async retry policy. + RetryHandler.AsyncRetryHandlerPolicy AsyncBuild(); + + /// + /// Builds IAsyncPolicy for extensions to the builder. + /// + /// Async retry policy.
+ IAsyncPolicy AsyncBuildPolicy(); + + /// + /// Retrieves the instance of the retry policy builder to accommodate extensions to the builder + /// + RetryPolicyBuilder PolicyBuilderBase { get; } + } + #endregion + + #region Builder interface implementations + private readonly Defaults defaultOptions; + + private readonly struct Defaults + { + public readonly RetryPolicyOptions PolicyOptions; + public readonly RetryPolicyBuilder PolicyBuilderBase; + + public Defaults(RetryPolicyBuilder retryHandler, RetryPolicyOptions options) + { + ArgumentNullException.ThrowIfNull(options); + PolicyOptions = options; + PolicyBuilderBase = retryHandler; + } + } + + private readonly struct PolicyBuilderPolicy : IPolicyBuilderPolicy + { + private readonly Defaults Defaults; + + public PolicyBuilderPolicy(Defaults options) + { + ArgumentNullException.ThrowIfNull(options); + Defaults = options; + } + + /// + IPolicyBuilderBase IPolicyBuilderPolicy.OpinionatedRetryPolicy() + => new PolicyBuilderBase(Policy.Handle(), Defaults); + + /// + IPolicyBuilderBase IPolicyBuilderPolicy.OpinionatedRetryPolicy(PolicyBuilder policy) + => new PolicyBuilderBase(policy, Defaults); + + /// + IPolicyBuilderBase IPolicyBuilderPolicy.OpinionatedRetryPolicy(PolicyBuilder policy) + => new PolicyBuilderBase(policy, Defaults); + + private readonly struct PolicyBuilderBase : IPolicyBuilderBase + { + public readonly PolicyBuilder policyBuilder; + public readonly Defaults Defaults; + + public PolicyBuilderBase(PolicyBuilder policyBuilder, Defaults defaults) + { + ArgumentNullException.ThrowIfNull(policyBuilder); + ArgumentNullException.ThrowIfNull(defaults); + this.policyBuilder = policyBuilder; + Defaults = defaults; + } + + public static Func DefaultSleepDurationProvider(Defaults defaults) + => ExponentialSleepDurationProvider(defaults.PolicyOptions.ExponentialBackOffExponent); + + public static Func ExponentialSleepDurationProvider(double exponentialBackOffExponent) + => (attempt, _1, _2) =>
TimeSpan.FromSeconds(Math.Pow(exponentialBackOffExponent, attempt)); + + public static Func ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(Func sleepDurationProvider, double exponentialBackOffExponent, bool backupSkipProvidedIncrements) + => (attempt, exception, ctx) => + { + return backupSkipProvidedIncrements + ? AdjustAttemptIfNeeded() + : sleepDurationProvider(attempt, exception) ?? ExponentialSleepDurationProvider(exponentialBackOffExponent)(attempt, exception, ctx); + + TimeSpan AdjustAttemptIfNeeded() + { + if (!ctx.TryGetValue(RetryHandler.BackupSkipProvidedIncrementKey, out var value) || value is not int || attempt < 2) + { + ctx[RetryHandler.BackupSkipProvidedIncrementKey] = value = 0; + } + + var result = sleepDurationProvider(attempt, exception); + + if (result is null) + { + var skipIncrement = (int)value; + attempt -= skipIncrement; + ctx[RetryHandler.BackupSkipProvidedIncrementKey] = ++skipIncrement; + result = ExponentialSleepDurationProvider(exponentialBackOffExponent)(attempt, exception, ctx); + } + + return result.Value; + } + }; + + + /// + IPolicyBuilderWait IPolicyBuilderBase.WithRetryPolicyOptionsWait() + => new PolicyBuilderWait(this, Defaults.PolicyOptions.MaxRetryCount, DefaultSleepDurationProvider(Defaults)); + + /// + IPolicyBuilderWait IPolicyBuilderBase.WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func sleepDurationProvider) + => new PolicyBuilderWait(this, maxRetryCount, (attempt, outcome, _1) => sleepDurationProvider(attempt, outcome)); + + /// + IPolicyBuilderWait IPolicyBuilderBase.WithExponentialBackoffWait(int retryCount, double exponentialBackOffExponent) + => new PolicyBuilderWait(this, retryCount, ExponentialSleepDurationProvider(exponentialBackOffExponent)); + + /// + IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func sleepDurationProvider, bool backupSkipProvidedIncrements) + => new PolicyBuilderWait(this, Defaults.PolicyOptions.MaxRetryCount, 
ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, Defaults.PolicyOptions.ExponentialBackOffExponent, backupSkipProvidedIncrements)); + + /// + IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithExponentialBackoffBackup(Func sleepDurationProvider, int retryCount, double exponentialBackOffExponent, bool backupSkipProvidedIncrements) + => new PolicyBuilderWait(this, retryCount, ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, exponentialBackOffExponent, backupSkipProvidedIncrements)); + } + + private readonly struct PolicyBuilderBase : IPolicyBuilderBase + { + public readonly PolicyBuilder policyBuilder; + public readonly Defaults Defaults; + + public PolicyBuilderBase(PolicyBuilder policyBuilder, Defaults defaults) + { + ArgumentNullException.ThrowIfNull(policyBuilder); + ArgumentNullException.ThrowIfNull(defaults); + this.policyBuilder = policyBuilder; + Defaults = defaults; + } + + /// + IPolicyBuilderWait IPolicyBuilderBase.WithRetryPolicyOptionsWait() + => new PolicyBuilderWait(this, Defaults.PolicyOptions.MaxRetryCount, default, PolicyBuilderBase.DefaultSleepDurationProvider(Defaults)); + + /// + IPolicyBuilderWait IPolicyBuilderBase.WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func waitDurationProvider) + => new PolicyBuilderWait(this, maxRetryCount, default, (attempt, outcome, _1) => waitDurationProvider(attempt, outcome)); + + /// + IPolicyBuilderWait IPolicyBuilderBase.WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func, TimeSpan> sleepDurationProvider) + => new PolicyBuilderWait(this, maxRetryCount, (attempt, outcome, _1) => sleepDurationProvider(attempt, outcome), default); + + /// + IPolicyBuilderWait IPolicyBuilderBase.WithExponentialBackoffWait(int maxRetryCount, double exponentialBackOffExponent) + => new PolicyBuilderWait(this, maxRetryCount, default, PolicyBuilderBase.ExponentialSleepDurationProvider(exponentialBackOffExponent)); + + /// + 
IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func sleepDurationProvider, bool backupSkipProvidedIncrements) + => new PolicyBuilderWait(this, Defaults.PolicyOptions.MaxRetryCount, default, PolicyBuilderBase.ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, Defaults.PolicyOptions.ExponentialBackOffExponent, backupSkipProvidedIncrements)); + + /// + IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithExponentialBackoffBackup(Func sleepDurationProvider, int retryCount, double exponentialBackOffExponent, bool backupSkipProvidedIncrements) + => new PolicyBuilderWait(this, retryCount, default, PolicyBuilderBase.ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, exponentialBackOffExponent, backupSkipProvidedIncrements)); + } + + private readonly struct PolicyBuilderWait : IPolicyBuilderWait + { + public readonly PolicyBuilderBase builderBase; + public readonly Func sleepDurationProvider; + public readonly int maxRetryCount; + + public PolicyBuilderWait(PolicyBuilderBase builderBase, int maxRetryCount, Func sleepDurationProvider) + { + ArgumentNullException.ThrowIfNull(sleepDurationProvider); + this.builderBase = builderBase; + this.maxRetryCount = maxRetryCount; + this.sleepDurationProvider = sleepDurationProvider; + } + + /// + IPolicyBuilderBuild IPolicyBuilderWait.SetOnRetryBehavior(ILogger? logger, RetryHandler.OnRetryHandler? onRetry, RetryHandler.OnRetryHandlerAsync? onRetryAsync) + => new PolicyBuilderBuild(this, sleepDurationProvider, logger, onRetry, onRetryAsync); + } + + private readonly struct PolicyBuilderWait : IPolicyBuilderWait + { + public readonly PolicyBuilderBase builderBase; + public readonly Func? sleepDurationProvider; + public readonly Func, Context, TimeSpan>? genericSleepDurationProvider; + public readonly int maxRetryCount; + + private static Func, Context, TimeSpan> PickSleepDurationProvider(Func, Context, TimeSpan>? 
tResultProvider, Func? exceptionProvider) + => tResultProvider is null ? (attempt, outcome, ctx) => exceptionProvider!(attempt, outcome.Exception, ctx) : tResultProvider; + + public PolicyBuilderWait(PolicyBuilderBase builderBase, int maxRetryCount, Func, Context, TimeSpan>? sleepDurationProviderResult, Func? sleepDurationProviderException) + { + if (sleepDurationProviderException is null && sleepDurationProviderResult is null) + { + throw new ArgumentNullException(null, $"At least one of {nameof(sleepDurationProviderResult)} or {nameof(sleepDurationProviderException)} must be provided."); + } + + this.builderBase = builderBase; + this.maxRetryCount = maxRetryCount; + this.sleepDurationProvider = sleepDurationProviderException; + this.genericSleepDurationProvider = sleepDurationProviderResult; + } + + /// + IPolicyBuilderBuild IPolicyBuilderWait.SetOnRetryBehavior(ILogger? logger, RetryHandler.OnRetryHandler? onRetry, RetryHandler.OnRetryHandlerAsync? onRetryAsync) + => new PolicyBuilderBuild(this, PickSleepDurationProvider(genericSleepDurationProvider, sleepDurationProvider), logger, onRetry, onRetryAsync); + } + + private readonly struct PolicyBuilderBuild : IPolicyBuilderBuild + { + private readonly PolicyBuilderWait builderWait; + private readonly Func sleepDurationProvider; + private readonly ILogger? logger; + private readonly RetryHandler.OnRetryHandler? onRetryHandler; + private readonly RetryHandler.OnRetryHandlerAsync? onRetryHandlerAsync; + + /// + public RetryPolicyBuilder PolicyBuilderBase { get; } + + public PolicyBuilderBuild(PolicyBuilderWait builderWait, Func sleepDurationProvider, ILogger? logger, RetryHandler.OnRetryHandler? onRetry, RetryHandler.OnRetryHandlerAsync? 
onRetryAsync) + { + ArgumentNullException.ThrowIfNull(sleepDurationProvider); + this.builderWait = builderWait; + this.sleepDurationProvider = sleepDurationProvider; + this.logger = logger; + this.onRetryHandler = onRetry; + this.onRetryHandlerAsync = onRetryAsync; + this.PolicyBuilderBase = builderWait.builderBase.Defaults.PolicyBuilderBase; + } + + public static Action Logger(ILogger? logger) + { + return (exception, timeSpan, retryCount, ctx) => + logger?.LogError(exception, @"Retrying in {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan:c} CorrelationId: {CorrelationId:D} ErrorMessage: {ExceptionMessage}", ctx[RetryHandler.CallerMemberNameKey], retryCount, timeSpan, ctx.CorrelationId, exception.Message); + } + + public static Action OnRetryHandler(ILogger? logger, RetryHandler.OnRetryHandler? onRetryHandler) + { + var handler = onRetryHandler ?? new((exception, timeSpan, retryCount, correlationId, caller) => { }); + + return (exception, timeSpan, retryCount, ctx) => + { + handler(exception, timeSpan, retryCount, ctx.CorrelationId, ctx[RetryHandler.CallerMemberNameKey] as string); + Logger(logger)(exception, timeSpan, retryCount, ctx); + }; + } + + public static Func OnRetryHandlerAsync(ILogger? logger, RetryHandler.OnRetryHandler? onRetryHandler, RetryHandler.OnRetryHandlerAsync? onRetryHandlerAsync) + { + var handler = onRetryHandler ?? new((exception, timeSpan, retryCount, correlationId, caller) => { }); + var handlerAsync = onRetryHandlerAsync ?? 
new((exception, timeSpan, retryCount, correlationId, caller) => + { + handler(exception, timeSpan, retryCount, correlationId, caller); + return Task.CompletedTask; + }); + + return async (exception, timeSpan, retryCount, ctx) => + { + await handlerAsync(exception, timeSpan, retryCount, ctx.CorrelationId, ctx[RetryHandler.CallerMemberNameKey] as string); + Logger(logger)(exception, timeSpan, retryCount, ctx); + }; + } + + /// + ISyncPolicy IPolicyBuilderBuild.SyncBuildPolicy() + { + var waitProvider = sleepDurationProvider; + var onRetryProvider = OnRetryHandler(logger, onRetryHandler); + + return builderWait.builderBase.policyBuilder.WaitAndRetry(builderWait.maxRetryCount, (attempt, ctx) => waitProvider(attempt, default, ctx), onRetryProvider); + } + + /// + IAsyncPolicy IPolicyBuilderBuild.AsyncBuildPolicy() + { + var waitProvider = sleepDurationProvider; + var onRetryProvider = OnRetryHandlerAsync(logger, onRetryHandler, onRetryHandlerAsync); + + return builderWait.builderBase.policyBuilder.WaitAndRetryAsync(builderWait.maxRetryCount, waitProvider, onRetryProvider); + } + + /// + RetryHandler.RetryHandlerPolicy IPolicyBuilderBuild.SyncBuild() + => new(((IPolicyBuilderBuild)this).SyncBuildPolicy()); + + /// + RetryHandler.AsyncRetryHandlerPolicy IPolicyBuilderBuild.AsyncBuild() + => new(((IPolicyBuilderBuild)this).AsyncBuildPolicy()); + } + + private readonly struct PolicyBuilderBuild : IPolicyBuilderBuild + { + private readonly PolicyBuilderWait builderWait; + private readonly Func, Context, TimeSpan> sleepDurationProvider; + private readonly ILogger? logger; + private readonly RetryHandler.OnRetryHandler? onRetryHandler; + private readonly RetryHandler.OnRetryHandlerAsync? onRetryHandlerAsync; + + /// + public RetryPolicyBuilder PolicyBuilderBase { get; } + + public PolicyBuilderBuild(PolicyBuilderWait builderWait, Func, Context, TimeSpan> sleepDurationProvider, ILogger? logger, RetryHandler.OnRetryHandler? onRetry, RetryHandler.OnRetryHandlerAsync? 
onRetryAsync) + { + ArgumentNullException.ThrowIfNull(sleepDurationProvider); + this.builderWait = builderWait; + this.sleepDurationProvider = sleepDurationProvider; + this.logger = logger; + this.onRetryHandler = onRetry; + this.onRetryHandlerAsync = onRetryAsync; + this.PolicyBuilderBase = builderWait.builderBase.Defaults.PolicyBuilderBase; + } + + private static Action, TimeSpan, int, Context> Logger(ILogger? logger) + { + return (outcome, timeSpan, retryCount, ctx) => + { + if (outcome.Exception is null) + { + logger?.LogError(@"Retrying in {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan:c} CorrelationId: {CorrelationId:D}", ctx[RetryHandler.CallerMemberNameKey], retryCount, timeSpan, ctx.CorrelationId); + } + else + { + logger?.LogError(outcome.Exception, @"Retrying in {Method}: RetryCount: {RetryCount} TimeSpan: {TimeSpan:c} CorrelationId: {CorrelationId:D} ErrorMessage: {ExceptionMessage}", ctx[RetryHandler.CallerMemberNameKey], retryCount, timeSpan, ctx.CorrelationId, outcome.Exception.Message); + } + }; + } + + //private static Action, TimeSpan, int, Context> OnRetryHandler(ILogger? logger, RetryHandler.OnRetryHandler>? onRetryHandler) + //{ + // var genericHandler = onRetryHandler ?? new((outcome, timeSpan, retryCount, correlationId, caller) => { }); + + // return (outcome, timeSpan, retryCount, ctx) => + // { + // genericHandler(outcome, timeSpan, retryCount, ctx.CorrelationId, ctx[RetryHandler.CallerMemberNameKey] as string); + // Logger(logger)(outcome, timeSpan, retryCount, ctx); + // }; + //} + + private static Func, TimeSpan, int, Context, Task> OnRetryHandlerAsync(ILogger? logger, RetryHandler.OnRetryHandler? onRetryHandler, RetryHandler.OnRetryHandlerAsync? onRetryHandlerAsync) + { + var handler = onRetryHandler ?? new((outcome, timeSpan, retryCount, correlationId, caller) => { }); + var asyncHandler = onRetryHandlerAsync ?? 
new((outcome, timespan, retryCount, correlationId, caller) => + { + handler(outcome, timespan, retryCount, correlationId, caller); + return Task.CompletedTask; + }); + + return async (outcome, timespan, retryCount, ctx) => + { + await asyncHandler(outcome, timespan, retryCount, ctx.CorrelationId, ctx[RetryHandler.CallerMemberNameKey] as string); + Logger(logger)(outcome, timespan, retryCount, ctx); + }; + } + + ///// + //ISyncPolicy IPolicyBuilderBuild.BuildPolicy() + //{ + // var waitProvider = sleepDurationProvider; + // var onRetryProvider = OnRetryHandler(logger, onRetryHandler); + + // return builderWait.builderBase.policyBuilder.WaitAndRetry(builderWait.retryCount, waitProvider, onRetryProvider); + //} + + /// + IAsyncPolicy IPolicyBuilderBuild.AsyncBuildPolicy() + { + var waitProvider = sleepDurationProvider; + var onRetryProvider = OnRetryHandlerAsync(logger, onRetryHandler, onRetryHandlerAsync); + + return builderWait.builderBase.policyBuilder.WaitAndRetryAsync(builderWait.maxRetryCount, waitProvider, onRetryProvider); + } + + ///// + //RetryHandlerPolicy IPolicyBuilderBuild.Build() + // => new(((IPolicyBuilderBuild)this).BuildPolicy()); + + /// + RetryHandler.AsyncRetryHandlerPolicy IPolicyBuilderBuild.AsyncBuild() + => new(((IPolicyBuilderBuild)this).AsyncBuildPolicy()); + } + } + #endregion +} diff --git a/src/CommonUtilities/Base32.cs b/src/CommonUtilities/UtilityExtensions.cs similarity index 64% rename from src/CommonUtilities/Base32.cs rename to src/CommonUtilities/UtilityExtensions.cs index 76be93fc3..50f056d7d 100644 --- a/src/CommonUtilities/Base32.cs +++ b/src/CommonUtilities/UtilityExtensions.cs @@ -5,8 +5,9 @@ namespace CommonUtilities { - public static class Base32 + public static class UtilityExtensions { + #region RFC 4648 Base32 private static readonly char[] Rfc4648Base32 = new[] { 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '2', '3', '4', '5', '6', '7' 
}; private const int GroupBitlength = 5; private const int BitsPerByte = 8; @@ -17,7 +18,8 @@ public static class Base32 /// /// Data to convert. /// RFC 4648 Base32 representation - public static string ConvertToBase32(byte[] bytes) // https://datatracker.ietf.org/doc/html/rfc4648#section-6 + /// https://datatracker.ietf.org/doc/html/rfc4648#section-6 + public static string ConvertToBase32(this byte[] bytes) // The RFC 4648 Base32 algorithm requires that each byte be presented in MSB order, but BitArray on every platform presents them in LSB order. => new string(new BitArray(bytes).Cast() @@ -44,27 +46,28 @@ public static string ConvertToBase32(byte[] bytes) // https://datatracker.ietf.o 4 => @"=", _ => throw new InvalidOperationException(), // Keeps the compiler happy. }; + #endregion /// /// Converts each group (fixed number) of items into a new item /// /// Type of source items - /// Intermediate type + /// Intermediate type /// Type of the resultant items - /// The source enumerable of type . - /// The size of each group to create out of the entire enumeration. The last group may be smaller. - /// The function that prepares each into the value expected by . Its parameters are an item of type and the index of that item (starting from zero) within each group. - /// The function that creates the from each group of . + /// The source enumerable of type . + /// The size of each group to create out of the entire enumeration. The last group may be smaller. + /// The function that prepares each into the value expected by . Its parameters are an item of type and the index of that item (starting from zero) within each group. + /// The function that creates the from each group of items. /// An enumeration of from all of the groups. 
- private static IEnumerable ConvertGroup( - this IEnumerable ts, - int itemsPerGroup, - Func groupMemberFunc, - Func, TResult> groupResultFunc) - => ts + public static IEnumerable ConvertGroup( + this IEnumerable source, + int groupSize, + Func groupItemFunc, + Func, TResult> groupResultFunc) + => source .Select((value, index) => (Index: index, Value: value)) - .GroupBy(tuple => tuple.Index / itemsPerGroup) + .GroupBy(tuple => tuple.Index / groupSize) .OrderBy(tuple => tuple.Key) - .Select(items => groupResultFunc(items.Select(i => groupMemberFunc(i.Value, i.Index % itemsPerGroup)))); + .Select(groups => groupResultFunc(groups.Select(item => groupItemFunc(item.Value, item.Index % groupSize)))); } } diff --git a/src/GenerateBatchVmSkus/Program.cs b/src/GenerateBatchVmSkus/Program.cs index d91253ef1..20e580599 100644 --- a/src/GenerateBatchVmSkus/Program.cs +++ b/src/GenerateBatchVmSkus/Program.cs @@ -89,7 +89,7 @@ static async Task RunAsync(Configuration configuration) var appCache = new MemoryCache(new MemoryCacheOptions()); var options = new Mock>(); options.Setup(o => o.Value).Returns(new RetryPolicyOptions()); - var cacheAndRetryHandler = new CachingRetryHandler(appCache, options.Object); + var cacheAndRetryHandler = new CachingRetryPolicyBuilder(appCache, options.Object); var priceApiClient = new PriceApiClient(cacheAndRetryHandler, new NullLogger()); static double ConvertMiBToGiB(int value) => Math.Round(value / 1024.0, 2); diff --git a/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs b/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs index bed34376e..b0f56d53e 100644 --- a/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs +++ b/src/Tes.ApiClients.Tests/CacheAndRetryHandlerTest.cs @@ -25,7 +25,7 @@ public void SetUp() appCache = new MemoryCache(new MemoryCacheOptions()); mockInstanceToRetry = new Mock(); mockOptions.SetupGet(x => x.Value).Returns(new RetryPolicyOptions { ExponentialBackOffExponent = 1, MaxRetryCount = MaxRetryCount }); - var 
cachingRetryHandler = new CachingRetryHandler(appCache, mockOptions.Object); + var cachingRetryHandler = new CachingRetryPolicyBuilder(appCache, mockOptions.Object); cachingAsyncHttpResponseMessagePolicy = cachingRetryHandler .DefaultRetryHttpResponseMessagePolicyBuilder() diff --git a/src/Tes.ApiClients.Tests/PriceApiClientTests.cs b/src/Tes.ApiClients.Tests/PriceApiClientTests.cs index 90fc6430f..8ff9d8c11 100644 --- a/src/Tes.ApiClients.Tests/PriceApiClientTests.cs +++ b/src/Tes.ApiClients.Tests/PriceApiClientTests.cs @@ -15,7 +15,7 @@ namespace Tes.ApiClients.Tests public class PriceApiClientTests { private PriceApiClient pricingApiClient = null!; - private CachingRetryHandler cachingRetryHandler = null!; + private CachingRetryPolicyBuilder cachingRetryHandler = null!; private IMemoryCache appCache = null!; [TestInitialize] @@ -24,7 +24,7 @@ public void Initialize() appCache = new MemoryCache(new MemoryCacheOptions()); var options = new Mock>(); options.Setup(o => o.Value).Returns(new RetryPolicyOptions()); - cachingRetryHandler = new CachingRetryHandler(appCache, options.Object); + cachingRetryHandler = new CachingRetryPolicyBuilder(appCache, options.Object); pricingApiClient = new PriceApiClient(cachingRetryHandler, new NullLogger()); } diff --git a/src/Tes.ApiClients.Tests/TerraIntegration/TerraWsmApiClientIntegrationTests.cs b/src/Tes.ApiClients.Tests/TerraIntegration/TerraWsmApiClientIntegrationTests.cs index a5b440231..5790cca5f 100644 --- a/src/Tes.ApiClients.Tests/TerraIntegration/TerraWsmApiClientIntegrationTests.cs +++ b/src/Tes.ApiClients.Tests/TerraIntegration/TerraWsmApiClientIntegrationTests.cs @@ -24,7 +24,7 @@ public void Setup() var memoryCache = new MemoryCache(new MemoryCacheOptions()); wsmApiClient = new TerraWsmApiClient(envInfo.WsmApiHost, new TestEnvTokenCredential(), - new CachingRetryHandler(memoryCache, retryOptions), TestLoggerFactory.Create()); + new CachingRetryPolicyBuilder(memoryCache, retryOptions), TestLoggerFactory.Create()); } 
diff --git a/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs b/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs index d71a5f691..feb45c4cd 100644 --- a/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs +++ b/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs @@ -7,7 +7,6 @@ using Microsoft.Extensions.Logging.Abstractions; using Moq; using Tes.ApiClients.Models.Terra; -using static Tes.ApiClients.CachingRetryHandler; namespace Tes.ApiClients.Tests { @@ -16,8 +15,8 @@ public class TerraWsmApiClientTests { private TerraWsmApiClient terraWsmApiClient = null!; private Mock tokenCredential = null!; - private Mock cacheAndRetryHandler = null!; - private Lazy>> asyncResponseRetryPolicy = null!; + private Mock cacheAndRetryBuilder = null!; + private Lazy>> cacheAndRetryHandler = null!; private TerraApiStubData terraApiStubData = null!; [TestInitialize] @@ -25,13 +24,13 @@ public void SetUp() { terraApiStubData = new TerraApiStubData(); tokenCredential = new Mock(); - cacheAndRetryHandler = new Mock(); + cacheAndRetryBuilder = new Mock(); var cache = new Mock(); cache.Setup(c => c.CreateEntry(It.IsAny())).Returns(new Mock().Object); - cacheAndRetryHandler.SetupGet(c => c.AppCache).Returns(cache.Object); - asyncResponseRetryPolicy = new(TestServices.RetryHandlersHelpers.GetCachingAsyncRetryPolicyMock(cacheAndRetryHandler, c => c.DefaultRetryHttpResponseMessagePolicyBuilder())); + cacheAndRetryBuilder.SetupGet(c => c.AppCache).Returns(cache.Object); + cacheAndRetryHandler = new(TestServices.RetryHandlersHelpers.GetCachingAsyncRetryPolicyMock(cacheAndRetryBuilder, c => c.DefaultRetryHttpResponseMessagePolicyBuilder())); terraWsmApiClient = new TerraWsmApiClient(TerraApiStubData.WsmApiHost, tokenCredential.Object, - cacheAndRetryHandler.Object, NullLogger.Instance); + cacheAndRetryBuilder.Object, NullLogger.Instance); } [TestMethod] @@ -94,7 +93,7 @@ public async Task GetSasTokenAsync_ValidRequest_ReturnsPayload() Content = new 
StringContent(terraApiStubData.GetWsmSasTokenApiResponseInJson()) }; - asyncResponseRetryPolicy.Value.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) + cacheAndRetryHandler.Value.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(response); var apiResponse = await terraWsmApiClient.GetSasTokenAsync(terraApiStubData.WorkspaceId, @@ -113,7 +112,7 @@ public async Task GetContainerResourcesAsync_ValidRequest_ReturnsPayload() Content = new StringContent(terraApiStubData.GetContainerResourcesApiResponseInJson()) }; - asyncResponseRetryPolicy.Value.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) + cacheAndRetryHandler.Value.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(response); var apiResponse = await terraWsmApiClient.GetContainerResourcesAsync(terraApiStubData.WorkspaceId, @@ -130,7 +129,7 @@ public async Task DeleteBatchPoolAsync_204Response_Succeeds() var wsmResourceId = Guid.NewGuid(); var response = new HttpResponseMessage(HttpStatusCode.NoContent); - asyncResponseRetryPolicy.Value.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) + cacheAndRetryHandler.Value.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(response); await terraWsmApiClient.DeleteBatchPoolAsync(terraApiStubData.WorkspaceId, wsmResourceId, CancellationToken.None); @@ -153,7 +152,7 @@ public async Task GetResourceQuotaAsync_ValidResourceIdReturnsQuotaInformationAn { var body = terraApiStubData.GetResourceQuotaApiResponseInJson(); - asyncResponseRetryPolicy.Value.Setup(c => c.ExecuteWithRetryConversionAndCachingAsync(It.IsAny(), + cacheAndRetryHandler.Value.Setup(c => c.ExecuteWithRetryConversionAndCachingAsync(It.IsAny(), It.IsAny>>(), It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(body); @@ -180,7 +179,7 @@ public async Task GetLandingZoneResourcesAsync_ListOfLandingZoneResourcesAndGets { var body = 
terraApiStubData.GetResourceApiResponseInJson(); - asyncResponseRetryPolicy.Value.Setup(c => c.ExecuteWithRetryConversionAndCachingAsync(It.IsAny(), + cacheAndRetryHandler.Value.Setup(c => c.ExecuteWithRetryConversionAndCachingAsync(It.IsAny(), It.IsAny>>(), It.IsAny>>(), It.IsAny(), It.IsAny())) .ReturnsAsync(body); @@ -214,6 +213,5 @@ public void GetQuotaApiUrl_CorrectUrlIsParsed() Assert.AreEqual(expectedUrl, url.ToString()); } - } } diff --git a/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers.cs b/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers.cs index 98fad4711..9da8a8d2e 100644 --- a/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers.cs +++ b/src/Tes.ApiClients.Tests/TestServices/RetryHandlersHelpers.cs @@ -6,7 +6,8 @@ using Moq; using Polly; using static CommonUtilities.RetryHandler; -using static Tes.ApiClients.CachingRetryHandler; +using static CommonUtilities.RetryPolicyBuilder; +using static Tes.ApiClients.CachingRetryPolicyBuilder; namespace Tes.ApiClients.Tests.TestServices { @@ -15,9 +16,9 @@ internal static partial class RetryHandlersHelpers // TODO: Add ability to use a mocked ILogger with a mocked CachingRetryHandler where failures in the mocked retry handlers call the mocked ILogger. // The opt-in would be an optional argument like this: "Microsoft.Extensions.Logging.ILogger logger". 
- internal static Mock GetCachingAsyncRetryPolicyMock(Mock cachingRetryHandler) + internal static Mock GetCachingAsyncRetryPolicyMock(Mock cachingRetryHandler) { - var cachingAsyncRetryPolicy = new Mock(); + var cachingAsyncRetryPolicy = new Mock(); _ = cachingAsyncRetryPolicy.As(); var cachingPolicyBuild = new Mock(); cachingPolicyBuild.Setup(policy => policy.AsyncBuild()) @@ -43,9 +44,9 @@ internal static Mock GetCachingAsyncRetryPolicyM return cachingAsyncRetryPolicy; } - internal static Mock> GetCachingAsyncRetryPolicyMock(Mock cachingRetryHandler, Expression>> expression) + internal static Mock> GetCachingAsyncRetryPolicyMock(Mock cachingRetryHandler, Expression>> expression) { - var cachingAsyncRetryPolicy = new Mock>(); + var cachingAsyncRetryPolicy = new Mock>(); _ = cachingAsyncRetryPolicy.As>(); var cachingPolicyBuild = new Mock>(); cachingPolicyBuild.Setup(policy => policy.AsyncBuild()) diff --git a/src/Tes.ApiClients/CachingRetryHandler.cs b/src/Tes.ApiClients/CachingRetryHandler.cs index a7bef478d..2e92c178d 100644 --- a/src/Tes.ApiClients/CachingRetryHandler.cs +++ b/src/Tes.ApiClients/CachingRetryHandler.cs @@ -2,42 +2,23 @@ // Licensed under the MIT License. using CommonUtilities; -using CommonUtilities.Options; using Microsoft.Extensions.Caching.Memory; -using Microsoft.Extensions.Options; using Polly; -using Polly.Retry; namespace Tes.ApiClients { /// /// Contains an App Cache instances and retry policies. /// - public partial class CachingRetryHandler : RetryHandler, CachingRetryHandler.ICachingPolicyBuilderHandler + public static class CachingRetryHandler { - private readonly IMemoryCache appCache = null!; - public virtual IMemoryCache AppCache => appCache; - - /// - /// Contains an App Cache instances and retry policies. 
- /// - /// > - /// - public CachingRetryHandler(IMemoryCache appCache, IOptions retryPolicyOptions) : base(retryPolicyOptions) + public interface ICachingPolicy { - ArgumentNullException.ThrowIfNull(appCache); - - this.appCache = appCache; + IMemoryCache AppCache { get; } } - /// - /// Protected parameter-less constructor for mocking - /// - protected CachingRetryHandler() { } - #region CachingRetryHandlerPolicies - - public class CachingRetryHandlerPolicy : RetryHandlerPolicy, ICachingPolicy + public class CachingRetryHandlerPolicy : RetryHandler.RetryHandlerPolicy, ICachingPolicy { private readonly IMemoryCache appCache; @@ -69,7 +50,7 @@ public TResult ExecuteWithRetryAndCaching(string cacheKey, Func ExecuteWithRetryAndCachingAsync(stri } } - //public class CachingRetryHandlerPolicy : RetryHandlerPolicy, ICachingPolicy + //public class CachingRetryHandlerPolicy : RetryHandler.RetryHandlerPolicy, ICachingPolicy //{ // private readonly IMemoryCache appCache; @@ -136,7 +117,7 @@ public virtual async Task ExecuteWithRetryAndCachingAsync(stri // public virtual IMemoryCache AppCache => appCache; //} - public class CachingAsyncRetryHandlerPolicy : AsyncRetryHandlerPolicy, ICachingPolicy + public class CachingAsyncRetryHandlerPolicy : RetryHandler.AsyncRetryHandlerPolicy, ICachingPolicy { private readonly IMemoryCache appCache; @@ -220,107 +201,7 @@ public virtual async Task ExecuteWithRetryConversionAndCachingAsync(string } #endregion - #region Builder interfaces - public interface ICachingPolicy - { - IMemoryCache AppCache { get; } - } - - public interface ICachingPolicyBuilderBuild - { - /// - /// Builds with caching. - /// - /// Caching retry policy. - CachingRetryHandlerPolicy SyncBuild(); - - /// - /// Builds with caching. - /// - /// Caching retry policy. - CachingAsyncRetryHandlerPolicy AsyncBuild(); - } - - public interface ICachingPolicyBuilderBuild - { - ///// - ///// Builds with caching. - ///// - ///// Caching retry policy. 
- //CachingRetryHandlerPolicy SyncBuild(); - - /// - /// Builds with caching. - /// - /// Caching retry policy. - CachingAsyncRetryHandlerPolicy AsyncBuild(); - } - - /// Used internally and for testing. - public interface ICachingPolicyBuilderHandler - { - /// Used internally and for testing. - ICachingPolicyBuilderBuild CachingPolicyBuilder(IPolicyBuilderBuild policyBuilder); - /// Used internally and for testing. - ICachingPolicyBuilderBuild CachingPolicyBuilder(IPolicyBuilderBuild policyBuilder); - } - #endregion - - #region Builder interface implementations - ICachingPolicyBuilderBuild ICachingPolicyBuilderHandler.CachingPolicyBuilder(IPolicyBuilderBuild policyBuilder) - => new CachingPolicyBuilderBuild(policyBuilder, this); - - ICachingPolicyBuilderBuild ICachingPolicyBuilderHandler.CachingPolicyBuilder(IPolicyBuilderBuild policyBuilder) - => new CachingPolicyBuilderBuild(policyBuilder, this); - - private readonly struct CachingPolicyBuilderBuild : ICachingPolicyBuilderBuild - { - private readonly IPolicyBuilderBuild policyBuilder; - private readonly CachingRetryHandler cachingHandler; - - public CachingPolicyBuilderBuild(IPolicyBuilderBuild policyBuilder, CachingRetryHandler handler) - { - ArgumentNullException.ThrowIfNull(policyBuilder); - this.policyBuilder = policyBuilder; - this.cachingHandler = handler; - } - - CachingRetryHandlerPolicy ICachingPolicyBuilderBuild.SyncBuild() - { - return new(policyBuilder.SyncBuildPolicy(), cachingHandler.AppCache); - } - - CachingAsyncRetryHandlerPolicy ICachingPolicyBuilderBuild.AsyncBuild() - { - return new(policyBuilder.AsyncBuildPolicy(), cachingHandler.AppCache); - } - } - - private readonly struct CachingPolicyBuilderBuild : ICachingPolicyBuilderBuild - { - private readonly IPolicyBuilderBuild policyBuilder; - private readonly CachingRetryHandler cachingHandler; - - public CachingPolicyBuilderBuild(IPolicyBuilderBuild policyBuilder, CachingRetryHandler handler) - { - 
ArgumentNullException.ThrowIfNull(policyBuilder); - this.policyBuilder = policyBuilder; - this.cachingHandler = handler; - } - - //CachingRetryHandlerPolicy ICachingPolicyBuilderBuild.SyncBuild() - //{ - // return new(policyBuilder.SyncBuildPolicy(), cachingHandler.AppCache); - //} - - CachingAsyncRetryHandlerPolicy ICachingPolicyBuilderBuild.AsyncBuild() - { - return new(policyBuilder.AsyncBuildPolicy(), cachingHandler.AppCache); - } - } - #endregion - - internal static void ValidateArgs(string cacheKey, Func action) + private static void ValidateArgs(string cacheKey, Func action) { ArgumentNullException.ThrowIfNull(action); @@ -340,30 +221,4 @@ private static async Task ExecuteWithCacheAsync(IMemoryCache a return action(); }); } - - /// - /// Extension methods for - /// - public static class CachingRetryHandlerExtensions - { - /// - /// Default caching policy. - /// - /// policy builder. - /// OnRetry builder - public static CachingRetryHandler.ICachingPolicyBuilderBuild AddCaching(this RetryHandler.IPolicyBuilderBuild policyBuilder) - { - return ((CachingRetryHandler.ICachingPolicyBuilderHandler)policyBuilder.PolicyBuilderBase).CachingPolicyBuilder(policyBuilder); - } - - /// - /// Default caching policy. - /// - /// policy builder. - /// OnRetry builder - public static CachingRetryHandler.ICachingPolicyBuilderBuild AddCaching(this RetryHandler.IPolicyBuilderBuild policyBuilder) - { - return ((CachingRetryHandler.ICachingPolicyBuilderHandler)policyBuilder.PolicyBuilderBase).CachingPolicyBuilder(policyBuilder); - } - } } diff --git a/src/Tes.ApiClients/CachingRetryPolicyBuilder.cs b/src/Tes.ApiClients/CachingRetryPolicyBuilder.cs new file mode 100644 index 000000000..a7daf96df --- /dev/null +++ b/src/Tes.ApiClients/CachingRetryPolicyBuilder.cs @@ -0,0 +1,158 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +using CommonUtilities; +using CommonUtilities.Options; +using Microsoft.Extensions.Caching.Memory; +using Microsoft.Extensions.Options; +using Polly.Retry; + +namespace Tes.ApiClients +{ + /// + /// Contains an App Cache instances and retry policies. + /// + public partial class CachingRetryPolicyBuilder : RetryPolicyBuilder, CachingRetryPolicyBuilder.ICachingPolicyBuilderHandler + { + private readonly IMemoryCache appCache = null!; + public virtual IMemoryCache AppCache => appCache; + + /// + /// Contains an App Cache instances and retry policies. + /// + /// > + /// + public CachingRetryPolicyBuilder(IMemoryCache appCache, IOptions retryPolicyOptions) : base(retryPolicyOptions) + { + ArgumentNullException.ThrowIfNull(appCache); + + this.appCache = appCache; + } + + /// + /// Protected parameter-less constructor for mocking + /// + protected CachingRetryPolicyBuilder() { } + + #region Builder interfaces + public interface ICachingPolicyBuilderBuild + { + /// + /// Builds with caching. + /// + /// Caching retry policy. + CachingRetryHandler.CachingRetryHandlerPolicy SyncBuild(); + + /// + /// Builds with caching. + /// + /// Caching retry policy. + CachingRetryHandler.CachingAsyncRetryHandlerPolicy AsyncBuild(); + } + + public interface ICachingPolicyBuilderBuild + { + ///// + ///// Builds with caching. + ///// + ///// Caching retry policy. + //CachingRetryHandler.CachingRetryHandlerPolicy SyncBuild(); + + /// + /// Builds with caching. + /// + /// Caching retry policy. + CachingRetryHandler.CachingAsyncRetryHandlerPolicy AsyncBuild(); + } + + /// Used internally and for testing. + public interface ICachingPolicyBuilderHandler + { + /// Used internally and for testing. + ICachingPolicyBuilderBuild CachingPolicyBuilder(IPolicyBuilderBuild policyBuilder); + /// Used internally and for testing. 
+ ICachingPolicyBuilderBuild CachingPolicyBuilder(IPolicyBuilderBuild policyBuilder); + } + #endregion + + #region Builder interface implementations + ICachingPolicyBuilderBuild ICachingPolicyBuilderHandler.CachingPolicyBuilder(IPolicyBuilderBuild policyBuilder) + => new CachingPolicyBuilderBuild(policyBuilder, this); + + ICachingPolicyBuilderBuild ICachingPolicyBuilderHandler.CachingPolicyBuilder(IPolicyBuilderBuild policyBuilder) + => new CachingPolicyBuilderBuild(policyBuilder, this); + + private readonly struct CachingPolicyBuilderBuild : ICachingPolicyBuilderBuild + { + private readonly IPolicyBuilderBuild policyBuilder; + private readonly CachingRetryPolicyBuilder cachingHandler; + + public CachingPolicyBuilderBuild(IPolicyBuilderBuild policyBuilder, CachingRetryPolicyBuilder handler) + { + ArgumentNullException.ThrowIfNull(policyBuilder); + this.policyBuilder = policyBuilder; + this.cachingHandler = handler; + } + + CachingRetryHandler.CachingRetryHandlerPolicy ICachingPolicyBuilderBuild.SyncBuild() + { + return new(policyBuilder.SyncBuildPolicy(), cachingHandler.AppCache); + } + + CachingRetryHandler.CachingAsyncRetryHandlerPolicy ICachingPolicyBuilderBuild.AsyncBuild() + { + return new(policyBuilder.AsyncBuildPolicy(), cachingHandler.AppCache); + } + } + + private readonly struct CachingPolicyBuilderBuild : ICachingPolicyBuilderBuild + { + private readonly IPolicyBuilderBuild policyBuilder; + private readonly CachingRetryPolicyBuilder cachingHandler; + + public CachingPolicyBuilderBuild(IPolicyBuilderBuild policyBuilder, CachingRetryPolicyBuilder handler) + { + ArgumentNullException.ThrowIfNull(policyBuilder); + this.policyBuilder = policyBuilder; + this.cachingHandler = handler; + } + + //CachingRetryHandler.CachingRetryHandlerPolicy ICachingPolicyBuilderBuild.SyncBuild() + //{ + // return new(policyBuilder.SyncBuildPolicy(), cachingHandler.AppCache); + //} + + CachingRetryHandler.CachingAsyncRetryHandlerPolicy ICachingPolicyBuilderBuild.AsyncBuild() + { 
+ return new(policyBuilder.AsyncBuildPolicy(), cachingHandler.AppCache); + } + } + #endregion + } + + /// + /// Extension methods for + /// + public static class CachingRetryHandlerExtensions + { + /// + /// Default caching policy. + /// + /// policy builder. + /// OnRetry builder + public static CachingRetryPolicyBuilder.ICachingPolicyBuilderBuild AddCaching(this RetryPolicyBuilder.IPolicyBuilderBuild policyBuilder) + { + return ((CachingRetryPolicyBuilder.ICachingPolicyBuilderHandler)policyBuilder.PolicyBuilderBase).CachingPolicyBuilder(policyBuilder); + } + + /// + /// Default caching policy. + /// + /// policy builder. + /// OnRetry builder + public static CachingRetryPolicyBuilder.ICachingPolicyBuilderBuild AddCaching(this RetryPolicyBuilder.IPolicyBuilderBuild policyBuilder) + { + return ((CachingRetryPolicyBuilder.ICachingPolicyBuilderHandler)policyBuilder.PolicyBuilderBase).CachingPolicyBuilder(policyBuilder); + } + } +} diff --git a/src/Tes.ApiClients/HttpApiClient.cs b/src/Tes.ApiClients/HttpApiClient.cs index c478e07c0..23e7f15e8 100644 --- a/src/Tes.ApiClients/HttpApiClient.cs +++ b/src/Tes.ApiClients/HttpApiClient.cs @@ -39,7 +39,7 @@ public abstract class HttpApiClient /// /// /// - protected HttpApiClient(CachingRetryHandler cachingRetryHandler, ILogger logger) + protected HttpApiClient(CachingRetryPolicyBuilder cachingRetryHandler, ILogger logger) { ArgumentNullException.ThrowIfNull(cachingRetryHandler); ArgumentNullException.ThrowIfNull(logger); @@ -61,7 +61,7 @@ protected HttpApiClient(CachingRetryHandler cachingRetryHandler, ILogger logger) /// /// protected HttpApiClient(TokenCredential tokenCredential, string tokenScope, - CachingRetryHandler cachingRetryHandler, ILogger logger) : this(cachingRetryHandler, logger) + CachingRetryPolicyBuilder cachingRetryHandler, ILogger logger) : this(cachingRetryHandler, logger) { ArgumentNullException.ThrowIfNull(tokenCredential); ArgumentException.ThrowIfNullOrEmpty(tokenScope); @@ -78,7 +78,7 @@ protected 
HttpApiClient() { } /// /// A logging Polly retry handler. /// - /// + /// private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHttpResponseMessageHandler() => new((result, timeSpan, retryCount, correlationId, caller) => { @@ -346,7 +346,7 @@ private string ToHash(string input) /// A for controlling the lifetime of the asynchronous operation. /// Response's content deserialization type /// - protected async Task GetApiResponseContentAsync(HttpResponseMessage response, CancellationToken cancellationToken) + protected static async Task GetApiResponseContentAsync(HttpResponseMessage response, CancellationToken cancellationToken) { response.EnsureSuccessStatusCode(); diff --git a/src/Tes.ApiClients/PriceApiClient.cs b/src/Tes.ApiClients/PriceApiClient.cs index 18e199b5b..dd4d549ef 100644 --- a/src/Tes.ApiClients/PriceApiClient.cs +++ b/src/Tes.ApiClients/PriceApiClient.cs @@ -17,9 +17,9 @@ public class PriceApiClient : HttpApiClient /// /// Constructor of the Price API Client. /// - /// + /// /// - public PriceApiClient(CachingRetryHandler cachingRetryHandler, ILogger logger) : base(cachingRetryHandler, logger) + public PriceApiClient(CachingRetryPolicyBuilder cachingRetryHandler, ILogger logger) : base(cachingRetryHandler, logger) { } diff --git a/src/Tes.ApiClients/TerraApiClient.cs b/src/Tes.ApiClients/TerraApiClient.cs index fe33f5a3d..61636dc7d 100644 --- a/src/Tes.ApiClients/TerraApiClient.cs +++ b/src/Tes.ApiClients/TerraApiClient.cs @@ -24,14 +24,11 @@ protected TerraApiClient() { } /// /// API Host /// - /// + /// /// - protected TerraApiClient(string apiUrl, TokenCredential tokenCredential, CachingRetryHandler cachingRetryHandler, ILogger logger) : base(tokenCredential, TokenScope, cachingRetryHandler, logger) + protected TerraApiClient(string apiUrl, TokenCredential tokenCredential, CachingRetryPolicyBuilder cachingRetryHandler, ILogger logger) : base(tokenCredential, TokenScope, cachingRetryHandler, logger) { ArgumentException.ThrowIfNullOrEmpty(apiUrl); 
- ArgumentNullException.ThrowIfNull(tokenCredential); - ArgumentNullException.ThrowIfNull(cachingRetryHandler); - ArgumentNullException.ThrowIfNull(logger); ApiUrl = apiUrl; } diff --git a/src/Tes.ApiClients/TerraWsmApiClient.cs b/src/Tes.ApiClients/TerraWsmApiClient.cs index 42be2802a..90ef71a5c 100644 --- a/src/Tes.ApiClients/TerraWsmApiClient.cs +++ b/src/Tes.ApiClients/TerraWsmApiClient.cs @@ -28,7 +28,7 @@ public class TerraWsmApiClient : TerraApiClient /// /// /// - public TerraWsmApiClient(string apiUrl, TokenCredential tokenCredential, CachingRetryHandler cachingRetryHandler, + public TerraWsmApiClient(string apiUrl, TokenCredential tokenCredential, CachingRetryPolicyBuilder cachingRetryHandler, ILogger logger) : base(apiUrl, tokenCredential, cachingRetryHandler, logger) { @@ -37,7 +37,7 @@ public TerraWsmApiClient(string apiUrl, TokenCredential tokenCredential, Caching public static TerraWsmApiClient CreateTerraWsmApiClient(string apiUrl, TokenCredential tokenCredential) { var retryPolicyOptions = new RetryPolicyOptions(); - var cacheRetryHandler = new CachingRetryHandler(sharedMemoryCache, + var cacheRetryHandler = new CachingRetryPolicyBuilder(sharedMemoryCache, Microsoft.Extensions.Options.Options.Create(retryPolicyOptions)); return new TerraWsmApiClient(apiUrl, tokenCredential, cacheRetryHandler, ApiClientsLoggerFactory.Create()); @@ -112,7 +112,7 @@ public virtual async Task CreateBatchPool(Guid works { var uri = GetCreateBatchPoolUrl(workspaceId); - Logger.LogInformation($"Creating a batch pool using WSM for workspace: {workspaceId}"); + Logger.LogInformation(@"Creating a batch pool using WSM for workspace: {WorkspaceId}", workspaceId); response = await HttpSendRequestWithRetryPolicyAsync(() => new HttpRequestMessage(HttpMethod.Post, uri) { Content = GetBatchPoolRequestContent(apiCreateBatchPool) }, @@ -120,7 +120,7 @@ public virtual async Task CreateBatchPool(Guid works var apiResponse = await GetApiResponseContentAsync(response, cancellationToken); 
- Logger.LogInformation($"Successfully created a batch pool using WSM for workspace: {workspaceId}"); + Logger.LogInformation(@"Successfully created a batch pool using WSM for workspace: {WorkspaceId}", workspaceId); return apiResponse; } @@ -147,7 +147,7 @@ public virtual async Task DeleteBatchPoolAsync(Guid workspaceId, Guid wsmBatchPo { var uri = GetDeleteBatchPoolUrl(workspaceId, wsmBatchPoolResourceId); - Logger.LogInformation($"Deleting the Batch pool using WSM for workspace: {workspaceId} WSM resource ID: {wsmBatchPoolResourceId}"); + Logger.LogInformation(@"Deleting the Batch pool using WSM for workspace: {WorkspaceId} WSM resource ID: {WsmBatchPoolResourceId}", workspaceId, wsmBatchPoolResourceId); response = await HttpSendRequestWithRetryPolicyAsync(() => new HttpRequestMessage(HttpMethod.Delete, uri), @@ -155,7 +155,7 @@ public virtual async Task DeleteBatchPoolAsync(Guid workspaceId, Guid wsmBatchPo response.EnsureSuccessStatusCode(); - Logger.LogInformation($"Successfully deleted Batch pool, WSM resource ID: {wsmBatchPoolResourceId} using WSM for workspace: {workspaceId}"); + Logger.LogInformation(@"Successfully deleted Batch pool, WSM resource ID: {WsmBatchPoolResourceId} using WSM for workspace: {WorkspaceId}", wsmBatchPoolResourceId, workspaceId); } catch (Exception ex) { @@ -248,7 +248,7 @@ private async Task LogResponseContentAsync(HttpResponseMessage response, string responseContent = await ReadResponseBodyAsync(response, cancellationToken); } - Logger.LogError(ex, $"{errMessage}. Response content:{responseContent}"); + Logger.LogError(ex, @"{ErrorMessage}. 
Response content:{ResponseContent}", errMessage, responseContent); } private string GetCreateBatchPoolUrl(Guid workspaceId) diff --git a/src/Tes.Runner/Docker/DockerExecutor.cs b/src/Tes.Runner/Docker/DockerExecutor.cs index e973527e6..18f313c92 100644 --- a/src/Tes.Runner/Docker/DockerExecutor.cs +++ b/src/Tes.Runner/Docker/DockerExecutor.cs @@ -11,16 +11,16 @@ using Tes.Runner.Logs; using Tes.Runner.Models; using Tes.Runner.Transfer; +using static CommonUtilities.RetryHandler; namespace Tes.Runner.Docker { public class DockerExecutor { - private readonly IDockerClient dockerClient = null!; private readonly ILogger logger = PipelineLoggerFactory.Create(); private readonly NetworkUtility networkUtility = new(); - private readonly RetryHandler.AsyncRetryHandlerPolicy asyncRetryPolicy = null!; + private readonly AsyncRetryHandlerPolicy retryHandler = null!; private readonly IStreamLogReader streamLogReader = null!; private readonly ContainerRegistryAuthorizationManager containerRegistryAuthorizationManager = null!; @@ -48,8 +48,8 @@ public DockerExecutor(IDockerClient dockerClient, IStreamLogReader streamLogRead /// protected DockerExecutor() { - this.asyncRetryPolicy = - new RetryHandler(Options.Create(new RetryPolicyOptions())) + this.retryHandler = + new RetryPolicyBuilder(Options.Create(new RetryPolicyOptions())) .DefaultRetryPolicyBuilder().SetOnRetryBehavior(logger).AsyncBuild(); } @@ -141,7 +141,7 @@ private async Task PullImageWithRetriesAsync(string imageName, string? 
tag, Auth { logger.LogInformation($"Pulling image name: {imageName} image tag: {tag}"); - await asyncRetryPolicy.ExecuteWithRetryAsync(async () => + await retryHandler.ExecuteWithRetryAsync(async () => { await dockerClient.Images.CreateImageAsync( new ImagesCreateParameters() { FromImage = imageName, Tag = tag }, diff --git a/src/TesApi.Tests/PriceApiBatchSkuInformationProviderTests.cs b/src/TesApi.Tests/PriceApiBatchSkuInformationProviderTests.cs index 63afbfe8e..86e23d379 100644 --- a/src/TesApi.Tests/PriceApiBatchSkuInformationProviderTests.cs +++ b/src/TesApi.Tests/PriceApiBatchSkuInformationProviderTests.cs @@ -20,7 +20,7 @@ public class PriceApiBatchSkuInformationProviderTests private PriceApiClient pricingApiClient; private PriceApiBatchSkuInformationProvider provider; private IMemoryCache appCache; - private CachingRetryHandler cachingRetryHandler; + private CachingRetryPolicyBuilder cachingRetryHandler; private Mock> mockRetryOptions; [TestInitialize] @@ -30,7 +30,7 @@ public void Initialize() mockRetryOptions = new Mock>(); mockRetryOptions.Setup(m => m.Value).Returns(new RetryPolicyOptions()); - cachingRetryHandler = new CachingRetryHandler(appCache, mockRetryOptions.Object); + cachingRetryHandler = new CachingRetryPolicyBuilder(appCache, mockRetryOptions.Object); pricingApiClient = new PriceApiClient(cachingRetryHandler, new NullLogger()); provider = new PriceApiBatchSkuInformationProvider(pricingApiClient, new NullLogger()); } diff --git a/src/TesApi.Tests/TestServices/TestServiceProvider.cs b/src/TesApi.Tests/TestServices/TestServiceProvider.cs index 2f76f0087..6b49d8f80 100644 --- a/src/TesApi.Tests/TestServices/TestServiceProvider.cs +++ b/src/TesApi.Tests/TestServices/TestServiceProvider.cs @@ -80,7 +80,7 @@ internal TestServiceProvider( .AddTransient>(_ => NullLogger.Instance) .AddTransient>(_ => NullLogger.Instance) .AddTransient>(_ => NullLogger.Instance) - .AddSingleton() + .AddSingleton() .AddSingleton() .AddSingleton() .AddTransient() diff 
--git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index 5cb808d75..570657fb2 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -20,13 +20,13 @@ using Microsoft.WindowsAzure.Storage.Blob; using Newtonsoft.Json; using Polly; -using Tes.ApiClients; using Tes.Models; using TesApi.Web.Extensions; using TesApi.Web.Management; using TesApi.Web.Management.Batch; using TesApi.Web.Management.Configuration; using TesApi.Web.Storage; +using static CommonUtilities.RetryHandler; using BatchModels = Microsoft.Azure.Management.Batch.Models; using CloudTask = Microsoft.Azure.Batch.CloudTask; using ComputeNodeState = Microsoft.Azure.Batch.Common.ComputeNodeState; @@ -45,8 +45,8 @@ namespace TesApi.Web public partial class AzureProxy : IAzureProxy { private const char BatchJobAttemptSeparator = '-'; - private readonly RetryHandler.AsyncRetryHandlerPolicy batchRetryPolicyWhenJobNotFound; - private readonly RetryHandler.AsyncRetryHandlerPolicy batchRetryPolicyWhenNodeNotReady; + private readonly AsyncRetryHandlerPolicy batchRetryPolicyWhenJobNotFound; + private readonly AsyncRetryHandlerPolicy batchRetryPolicyWhenNodeNotReady; private readonly ILogger logger; private readonly BatchClient batchClient; @@ -64,7 +64,7 @@ public partial class AzureProxy : IAzureProxy /// Retry builder /// The logger /// - public AzureProxy(IOptions batchAccountOptions, BatchAccountResourceInformation batchAccountInformation, IBatchPoolManager batchPoolManager, RetryHandler retryHandler, ILogger logger) + public AzureProxy(IOptions batchAccountOptions, BatchAccountResourceInformation batchAccountInformation, IBatchPoolManager batchPoolManager, RetryPolicyBuilder retryHandler, ILogger logger) { ArgumentNullException.ThrowIfNull(batchAccountOptions); ArgumentNullException.ThrowIfNull(batchAccountInformation); @@ -111,8 +111,8 @@ public AzureProxy(IOptions batchAccountOptions, BatchAccoun /// /// A logging retry handler. 
/// - /// - private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHandler() + /// + private OnRetryHandler LogRetryErrorOnRetryHandler() => new((exception, timeSpan, retryCount, correlationId, caller) => { var requestId = (exception as BatchException)?.RequestInformation?.ServiceRequestId ?? "n/a"; diff --git a/src/TesApi.Web/BatchScheduler.BatchPools.cs b/src/TesApi.Web/BatchScheduler.BatchPools.cs index bc090c1ba..be0ff21ae 100644 --- a/src/TesApi.Web/BatchScheduler.BatchPools.cs +++ b/src/TesApi.Web/BatchScheduler.BatchPools.cs @@ -10,10 +10,10 @@ using System.Text.RegularExpressions; using System.Threading; using System.Threading.Tasks; +using CommonUtilities; using Microsoft.Azure.Batch; using Microsoft.Azure.Batch.Common; using Microsoft.Extensions.Logging; -using Tes.Extensions; using Tes.Models; using static TesApi.Web.BatchScheduler.BatchPools; using BatchModels = Microsoft.Azure.Management.Batch.Models; @@ -44,7 +44,7 @@ public partial class BatchScheduler // Generate hash of everything that differentiates this group of pools var displayName = $"{label}:{vmSize}:{isPreemptable}:{identityResourceIds}"; - var hash = CommonUtilities.Base32.ConvertToBase32(SHA1.HashData(Encoding.UTF8.GetBytes(displayName))).TrimEnd('=').ToLowerInvariant(); // This becomes 32 chars + var hash = SHA1.HashData(Encoding.UTF8.GetBytes(displayName)).ConvertToBase32().TrimEnd('=').ToLowerInvariant(); // This becomes 32 chars // Build a PoolName that is of legal length, while exposing the most important metadata without requiring user to find DisplayName // Note that the hash covers all necessary parts to make name unique, so limiting the size of the other parts is not expected to appreciably change the risk of collisions. 
Those other parts are for convenience @@ -152,7 +152,7 @@ internal async Task GetOrAddPoolAsync(string key, bool isPreemptable var uniquifier = new byte[5]; // This always becomes 8 chars when converted to base32 RandomNumberGenerator.Fill(uniquifier); - var poolId = $"{key}-{CommonUtilities.Base32.ConvertToBase32(uniquifier).TrimEnd('=').ToLowerInvariant()}"; // embedded '-' is required by GetKeyFromPoolId() + var poolId = $"{key}-{uniquifier.ConvertToBase32().TrimEnd('=').ToLowerInvariant()}"; // embedded '-' is required by GetKeyFromPoolId() var modelPool = await modelPoolFactory(poolId, cancellationToken); modelPool.Metadata ??= new List(); modelPool.Metadata.Add(new(PoolHostName, this.batchPrefix)); diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index 67295475a..23fcf004a 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -13,6 +13,7 @@ using Microsoft.Extensions.Logging; using Tes.ApiClients; using TesApi.Web.Storage; +using static Tes.ApiClients.CachingRetryHandler; using BatchModels = Microsoft.Azure.Management.Batch.Models; namespace TesApi.Web @@ -24,10 +25,10 @@ public class CachingWithRetriesAzureProxy : IAzureProxy { private readonly ILogger logger; private readonly IAzureProxy azureProxy; - private readonly CachingRetryHandler.CachingRetryHandlerPolicy cachingRetry; - private readonly CachingRetryHandler.CachingAsyncRetryHandlerPolicy cachingAsyncRetry; - private readonly CachingRetryHandler.CachingAsyncRetryHandlerPolicy cachingAsyncRetryExceptWhenExists; - private readonly CachingRetryHandler.CachingAsyncRetryHandlerPolicy cachingAsyncRetryExceptWhenNotFound; + private readonly CachingRetryHandlerPolicy cachingRetry; + private readonly CachingAsyncRetryHandlerPolicy cachingAsyncRetry; + private readonly CachingAsyncRetryHandlerPolicy cachingAsyncRetryExceptWhenExists; + private readonly CachingAsyncRetryHandlerPolicy 
cachingAsyncRetryExceptWhenNotFound; /// /// Contructor to create a cache of @@ -35,7 +36,7 @@ public class CachingWithRetriesAzureProxy : IAzureProxy /// /// /// - public CachingWithRetriesAzureProxy(IAzureProxy azureProxy, CachingRetryHandler cachingRetryHandler, ILogger logger) + public CachingWithRetriesAzureProxy(IAzureProxy azureProxy, CachingRetryPolicyBuilder cachingRetryHandler, ILogger logger) { ArgumentNullException.ThrowIfNull(azureProxy); ArgumentNullException.ThrowIfNull(cachingRetryHandler); diff --git a/src/TesApi.Web/Startup.cs b/src/TesApi.Web/Startup.cs index e07b33533..4ffa6f4c8 100644 --- a/src/TesApi.Web/Startup.cs +++ b/src/TesApi.Web/Startup.cs @@ -97,8 +97,8 @@ public void ConfigureServices(IServiceCollection services) .AddSingleton>(sp => ActivatorUtilities.CreateInstance>(sp, (IRepository)sp.GetRequiredService(typeof(TesTaskPostgreSqlRepository)))) .AddAutoMapper(typeof(MappingProfilePoolToWsmRequest)) - .AddSingleton() - .AddSingleton(s => s.GetRequiredService()) + .AddSingleton() + .AddSingleton(s => s.GetRequiredService()) // Return the already declared retry policy builder .AddSingleton() .AddSingleton() .AddSingleton() diff --git a/src/deploy-tes-on-azure/Deployer.cs b/src/deploy-tes-on-azure/Deployer.cs index 2d29e4549..7136c63be 100644 --- a/src/deploy-tes-on-azure/Deployer.cs +++ b/src/deploy-tes-on-azure/Deployer.cs @@ -367,7 +367,7 @@ await Execute("Connecting to Azure Services...", async () => { var blob = new byte[5]; RandomNumberGenerator.Fill(blob); - configuration.BatchPrefix = Base32.ConvertToBase32(blob).TrimEnd('='); + configuration.BatchPrefix = blob.ConvertToBase32().TrimEnd('='); } ValidateRegionName(configuration.RegionName); From cbeffbb021bc72fc1e07923b592b3b42380a2d7b Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 18 Dec 2023 19:47:50 -0800 Subject: [PATCH 111/202] Improve code reuse --- src/Tes.ApiClients/CachingRetryHandler.cs | 36 ++++++++++++----------- src/Tes.ApiClients/HttpApiClient.cs | 30 
++++++++++++++++--- src/Tes.ApiClients/TerraWsmApiClient.cs | 8 ++--- 3 files changed, 47 insertions(+), 27 deletions(-) diff --git a/src/Tes.ApiClients/CachingRetryHandler.cs b/src/Tes.ApiClients/CachingRetryHandler.cs index 2e92c178d..24f5bb599 100644 --- a/src/Tes.ApiClients/CachingRetryHandler.cs +++ b/src/Tes.ApiClients/CachingRetryHandler.cs @@ -35,19 +35,19 @@ public CachingRetryHandlerPolicy() { } /// public virtual IMemoryCache AppCache => appCache; - /// - /// Executes a delegate with the specified policy. - /// - /// Synchronous retry policy. - /// Action to execute. - /// Name of method originating the retriable operation. - /// instance. - public TResult ExecuteWithRetryAndCaching(string cacheKey, Func action, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) - { - ArgumentNullException.ThrowIfNull(action); - - return appCache.GetOrCreate(cacheKey, _ => ExecuteWithRetry(action, caller)); - } + ///// + ///// Executes a delegate with the specified policy. + ///// + ///// Synchronous retry policy. + ///// Action to execute. + ///// Name of method originating the retriable operation. + ///// instance. + //public TResult ExecuteWithRetryAndCaching(string cacheKey, Func action, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) + //{ + // ArgumentNullException.ThrowIfNull(action); + + // return appCache.GetOrCreate(cacheKey, _ => ExecuteWithRetry(action, caller)); + //} } public class CachingAsyncRetryHandlerPolicy : RetryHandler.AsyncRetryHandlerPolicy, ICachingPolicy @@ -89,13 +89,14 @@ public virtual async Task ExecuteWithRetryAndCachingAsync(stri /// Action to execute /// /// A for controlling the lifetime of the asynchronous operation. + /// Name of method originating the retriable operation. 
/// /// - public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, DateTimeOffset cachesExpires, CancellationToken cancellationToken) + public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, DateTimeOffset cachesExpires, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) { ValidateArgs(cacheKey, action); - return await ExecuteWithCacheAsync(appCache, cacheKey, () => ExecuteWithRetryAsync(action, cancellationToken), cachesExpires); + return await ExecuteWithCacheAsync(appCache, cacheKey, () => ExecuteWithRetryAsync(action, cancellationToken, caller), cachesExpires); } } @@ -156,12 +157,13 @@ public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheK /// Action to execute /// /// A for controlling the lifetime of the asynchronous operation. + /// Name of method originating the retriable operation. /// - public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, DateTimeOffset cachesExpires, CancellationToken cancellationToken) + public virtual async Task ExecuteWithRetryAndCachingAsync(string cacheKey, Func> action, DateTimeOffset cachesExpires, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string caller = default) { ValidateArgs(cacheKey, action); - return await ExecuteWithCacheAsync(appCache, cacheKey, () => ExecuteWithRetryAsync(action, cancellationToken), cachesExpires); + return await ExecuteWithCacheAsync(appCache, cacheKey, () => ExecuteWithRetryAsync(action, cancellationToken, caller), cachesExpires); } /// diff --git a/src/Tes.ApiClients/HttpApiClient.cs b/src/Tes.ApiClients/HttpApiClient.cs index 23e7f15e8..7a485d335 100644 --- a/src/Tes.ApiClients/HttpApiClient.cs +++ b/src/Tes.ApiClients/HttpApiClient.cs @@ -117,6 +117,30 @@ protected async Task HttpSendRequestWithRetryPolicyAsync( }, cancellationToken); } + /// + /// Sends request with a retry 
policy + /// + /// Response's content deserialization type + /// Factory that creates new http requests, in the event of retry the factory is called again + /// and must be idempotent. + /// A for controlling the lifetime of the asynchronous operation. + /// If true, the authentication header is set with an authentication token + /// + protected async Task HttpSendRequestWithRetryPolicyAsync( + Func httpRequestFactory, CancellationToken cancellationToken, bool setAuthorizationHeader = false) + { + return await AsyncCachingHttpResponseMessageRetryPolicy.ExecuteWithRetryAndConversionAsync(async ct => + { + var request = httpRequestFactory(); + if (setAuthorizationHeader) + { + await AddAuthorizationHeaderToRequestAsync(request, ct); + } + + return await HttpClient.SendAsync(request, ct); + }, GetApiResponseContentAsync, cancellationToken); + } + /// /// Sends a Http Get request to the URL and returns body response as string /// @@ -186,16 +210,14 @@ protected async Task HttpGetRequestWithCachingAndRetryPolicyAsync(Uri re protected async Task HttpGetRequestWithRetryPolicyAsync(Uri requestUrl, CancellationToken cancellationToken, bool setAuthorizationHeader = false) { - var response = await AsyncCachingHttpResponseMessageRetryPolicy.ExecuteWithRetryAsync(async ct => + return await AsyncCachingHttpResponseMessageRetryPolicy.ExecuteWithRetryAndConversionAsync(async ct => { //request must be recreated in every retry. 
var httpRequest = await CreateGetHttpRequest(requestUrl, setAuthorizationHeader, ct); var httpResponse = await HttpClient.SendAsync(httpRequest, ct); return httpResponse.EnsureSuccessStatusCode(); - }, cancellationToken); - - return await ReadResponseBodyAsync(response, cancellationToken); + }, ReadResponseBodyAsync, cancellationToken); } /// diff --git a/src/Tes.ApiClients/TerraWsmApiClient.cs b/src/Tes.ApiClients/TerraWsmApiClient.cs index 90ef71a5c..bc35314db 100644 --- a/src/Tes.ApiClients/TerraWsmApiClient.cs +++ b/src/Tes.ApiClients/TerraWsmApiClient.cs @@ -60,10 +60,8 @@ public virtual async Task GetContainerResourc { var url = GetContainerResourcesApiUrl(workspaceId, offset, limit); - var response = await HttpSendRequestWithRetryPolicyAsync(() => new HttpRequestMessage(HttpMethod.Get, url), + return await HttpSendRequestWithRetryPolicyAsync(() => new HttpRequestMessage(HttpMethod.Get, url), cancellationToken, setAuthorizationHeader: true); - - return await GetApiResponseContentAsync(response, cancellationToken); } private Uri GetContainerResourcesApiUrl(Guid workspaceId, int offset, int limit) @@ -90,10 +88,8 @@ public virtual async Task GetSasTokenAsync(Guid workspac { var url = GetSasTokenApiUrl(workspaceId, resourceId, sasTokenApiParameters); - var response = await HttpSendRequestWithRetryPolicyAsync(() => new HttpRequestMessage(HttpMethod.Post, url), + return await HttpSendRequestWithRetryPolicyAsync(() => new HttpRequestMessage(HttpMethod.Post, url), cancellationToken, setAuthorizationHeader: true); - - return await GetApiResponseContentAsync(response, cancellationToken); } /// From 9eff3004742b1e8b12f0a7e0f6035b28618de19c Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 18 Dec 2023 20:12:15 -0800 Subject: [PATCH 112/202] fix unit tests --- .../TerraWsmApiClientTests.cs | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs 
b/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs index feb45c4cd..335f83da3 100644 --- a/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs +++ b/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs @@ -7,6 +7,7 @@ using Microsoft.Extensions.Logging.Abstractions; using Moq; using Tes.ApiClients.Models.Terra; +using TesApi.Web.Management.Models.Terra; namespace Tes.ApiClients.Tests { @@ -88,13 +89,8 @@ public void GetContainerSasTokenApiUri_WithSomeSasParameters_ReturnsExpectedUriW [TestMethod] public async Task GetSasTokenAsync_ValidRequest_ReturnsPayload() { - var response = new HttpResponseMessage(HttpStatusCode.OK) - { - Content = new StringContent(terraApiStubData.GetWsmSasTokenApiResponseInJson()) - }; - - cacheAndRetryHandler.Value.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) - .ReturnsAsync(response); + cacheAndRetryHandler.Value.Setup(c => c.ExecuteWithRetryAndConversionAsync(It.IsAny>>(), It.IsAny>>(), It.IsAny(), It.IsAny())) + .ReturnsAsync(System.Text.Json.JsonSerializer.Deserialize(terraApiStubData.GetWsmSasTokenApiResponseInJson())!); var apiResponse = await terraWsmApiClient.GetSasTokenAsync(terraApiStubData.WorkspaceId, terraApiStubData.ContainerResourceId, null!, CancellationToken.None); @@ -107,13 +103,8 @@ public async Task GetSasTokenAsync_ValidRequest_ReturnsPayload() [TestMethod] public async Task GetContainerResourcesAsync_ValidRequest_ReturnsPayload() { - var response = new HttpResponseMessage(HttpStatusCode.OK) - { - Content = new StringContent(terraApiStubData.GetContainerResourcesApiResponseInJson()) - }; - - cacheAndRetryHandler.Value.Setup(c => c.ExecuteWithRetryAsync(It.IsAny>>(), It.IsAny(), It.IsAny())) - .ReturnsAsync(response); + cacheAndRetryHandler.Value.Setup(c => c.ExecuteWithRetryAndConversionAsync(It.IsAny>>(), It.IsAny>>(), It.IsAny(), It.IsAny())) + .ReturnsAsync(System.Text.Json.JsonSerializer.Deserialize(terraApiStubData.GetContainerResourcesApiResponseInJson())!); var apiResponse = await 
terraWsmApiClient.GetContainerResourcesAsync(terraApiStubData.WorkspaceId, offset: 0, limit: 10, CancellationToken.None); From 9a8cee8338254b27fd73b733560e939c9a0d6702 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 18 Dec 2023 20:26:32 -0800 Subject: [PATCH 113/202] Reduce noise in PR --- src/Tes.ApiClients/HttpApiClient.cs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/Tes.ApiClients/HttpApiClient.cs b/src/Tes.ApiClients/HttpApiClient.cs index 7a485d335..adde0997c 100644 --- a/src/Tes.ApiClients/HttpApiClient.cs +++ b/src/Tes.ApiClients/HttpApiClient.cs @@ -27,7 +27,7 @@ public abstract class HttpApiClient private readonly SemaphoreSlim semaphore = new(1, 1); private AccessToken accessToken; - protected readonly CachingRetryHandler.CachingAsyncRetryHandlerPolicy AsyncCachingHttpResponseMessageRetryPolicy; + protected readonly CachingRetryHandler.CachingAsyncRetryHandlerPolicy cachingRetryHandler; /// /// Inner http client. @@ -37,16 +37,16 @@ public abstract class HttpApiClient /// /// Constructor of base HttpApiClient /// - /// + /// /// - protected HttpApiClient(CachingRetryPolicyBuilder cachingRetryHandler, ILogger logger) + protected HttpApiClient(CachingRetryPolicyBuilder cachingRetryBuilder, ILogger logger) { - ArgumentNullException.ThrowIfNull(cachingRetryHandler); + ArgumentNullException.ThrowIfNull(cachingRetryBuilder); ArgumentNullException.ThrowIfNull(logger); this.Logger = logger; - AsyncCachingHttpResponseMessageRetryPolicy = cachingRetryHandler + cachingRetryHandler = cachingRetryBuilder .DefaultRetryHttpResponseMessagePolicyBuilder() .SetOnRetryBehavior(onRetry: LogRetryErrorOnRetryHttpResponseMessageHandler()) .AddCaching() @@ -105,7 +105,7 @@ private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHtt protected async Task HttpSendRequestWithRetryPolicyAsync( Func httpRequestFactory, CancellationToken cancellationToken, bool setAuthorizationHeader = false) { - return await 
AsyncCachingHttpResponseMessageRetryPolicy.ExecuteWithRetryAsync(async ct => + return await cachingRetryHandler.ExecuteWithRetryAsync(async ct => { var request = httpRequestFactory(); if (setAuthorizationHeader) @@ -129,7 +129,7 @@ protected async Task HttpSendRequestWithRetryPolicyAsync( protected async Task HttpSendRequestWithRetryPolicyAsync( Func httpRequestFactory, CancellationToken cancellationToken, bool setAuthorizationHeader = false) { - return await AsyncCachingHttpResponseMessageRetryPolicy.ExecuteWithRetryAndConversionAsync(async ct => + return await cachingRetryHandler.ExecuteWithRetryAndConversionAsync(async ct => { var request = httpRequestFactory(); if (setAuthorizationHeader) @@ -188,7 +188,7 @@ protected async Task HttpGetRequestWithCachingAndRetryPolicyAsync(Uri re { var cacheKey = await ToCacheKeyAsync(requestUrl, setAuthorizationHeader, cancellationToken); - return (await AsyncCachingHttpResponseMessageRetryPolicy.ExecuteWithRetryConversionAndCachingAsync(cacheKey, + return (await cachingRetryHandler.ExecuteWithRetryConversionAndCachingAsync(cacheKey, async ct => { //request must be recreated in every retry. @@ -210,7 +210,7 @@ protected async Task HttpGetRequestWithCachingAndRetryPolicyAsync(Uri re protected async Task HttpGetRequestWithRetryPolicyAsync(Uri requestUrl, CancellationToken cancellationToken, bool setAuthorizationHeader = false) { - return await AsyncCachingHttpResponseMessageRetryPolicy.ExecuteWithRetryAndConversionAsync(async ct => + return await cachingRetryHandler.ExecuteWithRetryAndConversionAsync(async ct => { //request must be recreated in every retry. 
var httpRequest = await CreateGetHttpRequest(requestUrl, setAuthorizationHeader, ct); From 42769ff1006676f837b8152fe0ef6463aa0e57bc Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 18 Dec 2023 20:35:58 -0800 Subject: [PATCH 114/202] Add comments and reduce noise in PR --- src/Tes.ApiClients/HttpApiClient.cs | 42 +++++++++++++++-------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/src/Tes.ApiClients/HttpApiClient.cs b/src/Tes.ApiClients/HttpApiClient.cs index adde0997c..f8daa8031 100644 --- a/src/Tes.ApiClients/HttpApiClient.cs +++ b/src/Tes.ApiClients/HttpApiClient.cs @@ -104,18 +104,16 @@ private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHtt /// protected async Task HttpSendRequestWithRetryPolicyAsync( Func httpRequestFactory, CancellationToken cancellationToken, bool setAuthorizationHeader = false) - { - return await cachingRetryHandler.ExecuteWithRetryAsync(async ct => - { - var request = httpRequestFactory(); - if (setAuthorizationHeader) + => await cachingRetryHandler.ExecuteWithRetryAsync(async ct => { - await AddAuthorizationHeaderToRequestAsync(request, ct); - } + var request = httpRequestFactory(); + if (setAuthorizationHeader) + { + await AddAuthorizationHeaderToRequestAsync(request, ct); + } - return await HttpClient.SendAsync(request, ct); - }, cancellationToken); - } + return await HttpClient.SendAsync(request, ct); + }, cancellationToken); /// /// Sends request with a retry policy @@ -209,16 +207,14 @@ protected async Task HttpGetRequestWithCachingAndRetryPolicyAsync(Uri re /// protected async Task HttpGetRequestWithRetryPolicyAsync(Uri requestUrl, CancellationToken cancellationToken, bool setAuthorizationHeader = false) - { - return await cachingRetryHandler.ExecuteWithRetryAndConversionAsync(async ct => - { - //request must be recreated in every retry. 
- var httpRequest = await CreateGetHttpRequest(requestUrl, setAuthorizationHeader, ct); + => await cachingRetryHandler.ExecuteWithRetryAndConversionAsync(async ct => + { + //request must be recreated in every retry. + var httpRequest = await CreateGetHttpRequest(requestUrl, setAuthorizationHeader, ct); - var httpResponse = await HttpClient.SendAsync(httpRequest, ct); - return httpResponse.EnsureSuccessStatusCode(); - }, ReadResponseBodyAsync, cancellationToken); - } + var httpResponse = await HttpClient.SendAsync(httpRequest, ct); + return httpResponse.EnsureSuccessStatusCode(); + }, ReadResponseBodyAsync, cancellationToken); /// /// Returns an query string key-value, with the value escaped. If the value is null or empty returns an empty string @@ -277,6 +273,12 @@ private async Task CreateGetHttpRequest(Uri requestUrl, bool return httpRequest; } + /// + /// Returns the response content, the response is successful + /// + /// Response + /// A for controlling the lifetime of the asynchronous operation. + /// protected static async Task ReadResponseBodyAsync(HttpResponseMessage response, CancellationToken cancellationToken) { return await response.Content.ReadAsStringAsync(cancellationToken); @@ -362,7 +364,7 @@ private string ToHash(string input) } /// - /// Returns the response content, the response is successful + /// Returns the response content, the response is successful /// /// Response /// A for controlling the lifetime of the asynchronous operation. 
From 3840fefb003d7ff98ce8feead7bde89a1ed1617e Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 18 Dec 2023 21:39:44 -0800 Subject: [PATCH 115/202] fix unit tests --- .../TerraWsmApiClientTests.cs | 12 +-- src/Tes.ApiClients/HttpApiClient.cs | 79 ++++++++----------- src/Tes.ApiClients/TerraWsmApiClient.cs | 4 +- 3 files changed, 40 insertions(+), 55 deletions(-) diff --git a/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs b/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs index 335f83da3..8124e5a6d 100644 --- a/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs +++ b/src/Tes.ApiClients.Tests/TerraWsmApiClientTests.cs @@ -141,11 +141,9 @@ public void GetDeleteBatchPoolUrl_ValidWorkspaceAndResourceId_ValidWSMUrl() [TestMethod] public async Task GetResourceQuotaAsync_ValidResourceIdReturnsQuotaInformationAndGetsAuthToken() { - var body = terraApiStubData.GetResourceQuotaApiResponseInJson(); - cacheAndRetryHandler.Value.Setup(c => c.ExecuteWithRetryConversionAndCachingAsync(It.IsAny(), - It.IsAny>>(), It.IsAny>>(), It.IsAny(), It.IsAny())) - .ReturnsAsync(body); + It.IsAny>>(), It.IsAny>>(), It.IsAny(), It.IsAny())) + .ReturnsAsync(System.Text.Json.JsonSerializer.Deserialize(terraApiStubData.GetResourceQuotaApiResponseInJson())!); var quota = await terraWsmApiClient.GetResourceQuotaAsync(terraApiStubData.WorkspaceId, terraApiStubData.BatchAccountId, cacheResults: true, cancellationToken: CancellationToken.None); @@ -168,11 +166,9 @@ public async Task GetResourceQuotaAsync_ValidResourceIdReturnsQuotaInformationAn [TestMethod] public async Task GetLandingZoneResourcesAsync_ListOfLandingZoneResourcesAndGetsAuthToken() { - var body = terraApiStubData.GetResourceApiResponseInJson(); - cacheAndRetryHandler.Value.Setup(c => c.ExecuteWithRetryConversionAndCachingAsync(It.IsAny(), - It.IsAny>>(), It.IsAny>>(), It.IsAny(), It.IsAny())) - .ReturnsAsync(body); + It.IsAny>>(), It.IsAny>>(), It.IsAny(), It.IsAny())) + 
.ReturnsAsync(System.Text.Json.JsonSerializer.Deserialize(terraApiStubData.GetResourceApiResponseInJson())!); var resources = await terraWsmApiClient.GetLandingZoneResourcesAsync(terraApiStubData.WorkspaceId, CancellationToken.None); diff --git a/src/Tes.ApiClients/HttpApiClient.cs b/src/Tes.ApiClients/HttpApiClient.cs index f8daa8031..fc450b5d0 100644 --- a/src/Tes.ApiClients/HttpApiClient.cs +++ b/src/Tes.ApiClients/HttpApiClient.cs @@ -116,72 +116,61 @@ protected async Task HttpSendRequestWithRetryPolicyAsync( }, cancellationToken); /// - /// Sends request with a retry policy - /// - /// Response's content deserialization type - /// Factory that creates new http requests, in the event of retry the factory is called again - /// and must be idempotent. - /// A for controlling the lifetime of the asynchronous operation. - /// If true, the authentication header is set with an authentication token - /// - protected async Task HttpSendRequestWithRetryPolicyAsync( - Func httpRequestFactory, CancellationToken cancellationToken, bool setAuthorizationHeader = false) - { - return await cachingRetryHandler.ExecuteWithRetryAndConversionAsync(async ct => - { - var request = httpRequestFactory(); - if (setAuthorizationHeader) - { - await AddAuthorizationHeaderToRequestAsync(request, ct); - } - - return await HttpClient.SendAsync(request, ct); - }, GetApiResponseContentAsync, cancellationToken); - } - - /// - /// Sends a Http Get request to the URL and returns body response as string + /// Sends a Http Get request to the URL and deserializes the body response to the specified type /// + /// Response's content deserialization type /// /// /// /// A for controlling the lifetime of the asynchronous operation. 
+ /// /// - protected async Task HttpGetRequestAsync(Uri requestUrl, bool setAuthorizationHeader, bool cacheResults, CancellationToken cancellationToken) + protected async Task HttpGetRequestAsync(Uri requestUrl, bool setAuthorizationHeader, + bool cacheResults, CancellationToken cancellationToken) { if (cacheResults) { - return await HttpGetRequestWithCachingAndRetryPolicyAsync(requestUrl, cancellationToken, setAuthorizationHeader); + return await HttpGetRequestWithCachingAndRetryPolicyAsync(requestUrl, cancellationToken, setAuthorizationHeader); } - return await HttpGetRequestWithRetryPolicyAsync(requestUrl, cancellationToken, setAuthorizationHeader); + return await HttpGetRequestWithRetryPolicyAsync(requestUrl, cancellationToken, setAuthorizationHeader); } /// - /// Sends a Http Get request to the URL and deserializes the body response to the specified type + /// Sends an Http request to the URL and deserializes the body response to the specified type /// - /// - /// - /// + /// Response's content deserialization type + /// Factory that creates new http requests, in the event of retry the factory is called again + /// and must be idempotent /// A for controlling the lifetime of the asynchronous operation. 
- /// + /// If true, the authentication header is set with an authentication token /// - protected async Task HttpGetRequestAsync(Uri requestUrl, bool setAuthorizationHeader, - bool cacheResults, CancellationToken cancellationToken) + protected async Task HttpGetRequestWithRetryPolicyAsync( + Func httpRequestFactory, CancellationToken cancellationToken, bool setAuthorizationHeader = false) { - var content = await HttpGetRequestAsync(requestUrl, setAuthorizationHeader, cacheResults, cancellationToken); + return await cachingRetryHandler.ExecuteWithRetryAndConversionAsync(async ct => + { + var request = httpRequestFactory(); - return JsonSerializer.Deserialize(content)!; + if (setAuthorizationHeader) + { + await AddAuthorizationHeaderToRequestAsync(request, ct); + } + + return await HttpClient.SendAsync(request, ct); + }, + GetApiResponseContentAsync, cancellationToken); } /// /// Checks the cache and if the request was not found, sends the GET request with a retry policy /// + /// Response's content deserialization type /// /// A for controlling the lifetime of the asynchronous operation. /// /// - protected async Task HttpGetRequestWithCachingAndRetryPolicyAsync(Uri requestUrl, + protected async Task HttpGetRequestWithCachingAndRetryPolicyAsync(Uri requestUrl, CancellationToken cancellationToken, bool setAuthorizationHeader = false) { var cacheKey = await ToCacheKeyAsync(requestUrl, setAuthorizationHeader, cancellationToken); @@ -192,29 +181,29 @@ protected async Task HttpGetRequestWithCachingAndRetryPolicyAsync(Uri re //request must be recreated in every retry. 
var httpRequest = await CreateGetHttpRequest(requestUrl, setAuthorizationHeader, ct); - var httpResponse = await HttpClient.SendAsync(httpRequest, ct); - return httpResponse.EnsureSuccessStatusCode(); + return await HttpClient.SendAsync(httpRequest, ct); }, - ReadResponseBodyAsync, cancellationToken))!; + GetApiResponseContentAsync, cancellationToken))!; } /// /// Get request with retry policy /// + /// Response's content deserialization type /// /// A for controlling the lifetime of the asynchronous operation. /// /// - protected async Task HttpGetRequestWithRetryPolicyAsync(Uri requestUrl, + protected async Task HttpGetRequestWithRetryPolicyAsync(Uri requestUrl, CancellationToken cancellationToken, bool setAuthorizationHeader = false) => await cachingRetryHandler.ExecuteWithRetryAndConversionAsync(async ct => { //request must be recreated in every retry. var httpRequest = await CreateGetHttpRequest(requestUrl, setAuthorizationHeader, ct); - var httpResponse = await HttpClient.SendAsync(httpRequest, ct); - return httpResponse.EnsureSuccessStatusCode(); - }, ReadResponseBodyAsync, cancellationToken); + return await HttpClient.SendAsync(httpRequest, ct); + }, + GetApiResponseContentAsync, cancellationToken); /// /// Returns an query string key-value, with the value escaped. 
If the value is null or empty returns an empty string diff --git a/src/Tes.ApiClients/TerraWsmApiClient.cs b/src/Tes.ApiClients/TerraWsmApiClient.cs index bc35314db..2d9489867 100644 --- a/src/Tes.ApiClients/TerraWsmApiClient.cs +++ b/src/Tes.ApiClients/TerraWsmApiClient.cs @@ -60,7 +60,7 @@ public virtual async Task GetContainerResourc { var url = GetContainerResourcesApiUrl(workspaceId, offset, limit); - return await HttpSendRequestWithRetryPolicyAsync(() => new HttpRequestMessage(HttpMethod.Get, url), + return await HttpGetRequestWithRetryPolicyAsync(() => new HttpRequestMessage(HttpMethod.Get, url), cancellationToken, setAuthorizationHeader: true); } @@ -88,7 +88,7 @@ public virtual async Task GetSasTokenAsync(Guid workspac { var url = GetSasTokenApiUrl(workspaceId, resourceId, sasTokenApiParameters); - return await HttpSendRequestWithRetryPolicyAsync(() => new HttpRequestMessage(HttpMethod.Post, url), + return await HttpGetRequestWithRetryPolicyAsync(() => new HttpRequestMessage(HttpMethod.Post, url), cancellationToken, setAuthorizationHeader: true); } From 14fff0c4fd15b22834a8f0b0ebd07625c2705b8f Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 18 Dec 2023 21:54:44 -0800 Subject: [PATCH 116/202] formatting for PR clarity --- src/Tes.ApiClients/HttpApiClient.cs | 108 ++++++++++++++-------------- 1 file changed, 54 insertions(+), 54 deletions(-) diff --git a/src/Tes.ApiClients/HttpApiClient.cs b/src/Tes.ApiClients/HttpApiClient.cs index fc450b5d0..15f0566ce 100644 --- a/src/Tes.ApiClients/HttpApiClient.cs +++ b/src/Tes.ApiClients/HttpApiClient.cs @@ -105,36 +105,16 @@ private RetryHandler.OnRetryHandler LogRetryErrorOnRetryHtt protected async Task HttpSendRequestWithRetryPolicyAsync( Func httpRequestFactory, CancellationToken cancellationToken, bool setAuthorizationHeader = false) => await cachingRetryHandler.ExecuteWithRetryAsync(async ct => - { - var request = httpRequestFactory(); - if (setAuthorizationHeader) - { - await 
AddAuthorizationHeaderToRequestAsync(request, ct); - } - - return await HttpClient.SendAsync(request, ct); - }, cancellationToken); - - /// - /// Sends a Http Get request to the URL and deserializes the body response to the specified type - /// - /// Response's content deserialization type - /// - /// - /// - /// A for controlling the lifetime of the asynchronous operation. - /// - /// - protected async Task HttpGetRequestAsync(Uri requestUrl, bool setAuthorizationHeader, - bool cacheResults, CancellationToken cancellationToken) - { - if (cacheResults) { - return await HttpGetRequestWithCachingAndRetryPolicyAsync(requestUrl, cancellationToken, setAuthorizationHeader); - } + var request = httpRequestFactory(); - return await HttpGetRequestWithRetryPolicyAsync(requestUrl, cancellationToken, setAuthorizationHeader); - } + if (setAuthorizationHeader) + { + await AddAuthorizationHeaderToRequestAsync(request, ct); + } + + return await HttpClient.SendAsync(request, ct); + }, cancellationToken); /// /// Sends an Http request to the URL and deserializes the body response to the specified type @@ -162,6 +142,27 @@ protected async Task HttpGetRequestWithRetryPolicyAsync( GetApiResponseContentAsync, cancellationToken); } + /// + /// Sends a Http Get request to the URL and deserializes the body response to the specified type + /// + /// Response's content deserialization type + /// + /// + /// + /// A for controlling the lifetime of the asynchronous operation. 
+ /// + /// + protected async Task HttpGetRequestAsync(Uri requestUrl, bool setAuthorizationHeader, + bool cacheResults, CancellationToken cancellationToken) + { + if (cacheResults) + { + return await HttpGetRequestWithCachingAndRetryPolicyAsync(requestUrl, cancellationToken, setAuthorizationHeader); + } + + return await HttpGetRequestWithRetryPolicyAsync(requestUrl, cancellationToken, setAuthorizationHeader); + } + /// /// Checks the cache and if the request was not found, sends the GET request with a retry policy /// @@ -175,15 +176,14 @@ protected async Task HttpGetRequestWithCachingAndRetryPolicyAsync - { - //request must be recreated in every retry. - var httpRequest = await CreateGetHttpRequest(requestUrl, setAuthorizationHeader, ct); + return (await cachingRetryHandler.ExecuteWithRetryConversionAndCachingAsync(cacheKey, async ct => + { + //request must be recreated in every retry. + var httpRequest = await CreateGetHttpRequest(requestUrl, setAuthorizationHeader, ct); - return await HttpClient.SendAsync(httpRequest, ct); - }, - GetApiResponseContentAsync, cancellationToken))!; + return await HttpClient.SendAsync(httpRequest, ct); + }, + GetApiResponseContentAsync, cancellationToken))!; } /// @@ -196,14 +196,14 @@ protected async Task HttpGetRequestWithCachingAndRetryPolicyAsync protected async Task HttpGetRequestWithRetryPolicyAsync(Uri requestUrl, CancellationToken cancellationToken, bool setAuthorizationHeader = false) - => await cachingRetryHandler.ExecuteWithRetryAndConversionAsync(async ct => - { - //request must be recreated in every retry. - var httpRequest = await CreateGetHttpRequest(requestUrl, setAuthorizationHeader, ct); + => await cachingRetryHandler.ExecuteWithRetryAndConversionAsync(async ct => + { + //request must be recreated in every retry. 
+ var httpRequest = await CreateGetHttpRequest(requestUrl, setAuthorizationHeader, ct); - return await HttpClient.SendAsync(httpRequest, ct); - }, - GetApiResponseContentAsync, cancellationToken); + return await HttpClient.SendAsync(httpRequest, ct); + }, + GetApiResponseContentAsync, cancellationToken); /// /// Returns an query string key-value, with the value escaped. If the value is null or empty returns an empty string @@ -262,17 +262,6 @@ private async Task CreateGetHttpRequest(Uri requestUrl, bool return httpRequest; } - /// - /// Returns the response content, the response is successful - /// - /// Response - /// A for controlling the lifetime of the asynchronous operation. - /// - protected static async Task ReadResponseBodyAsync(HttpResponseMessage response, CancellationToken cancellationToken) - { - return await response.Content.ReadAsStringAsync(cancellationToken); - } - private async Task AddAuthorizationHeaderToRequestAsync(HttpRequestMessage requestMessage, CancellationToken cancellationToken) { if (string.IsNullOrEmpty(tokenScope)) @@ -365,5 +354,16 @@ protected static async Task GetApiResponseContentAsync(HttpResponseMessage return JsonSerializer.Deserialize(await ReadResponseBodyAsync(response, cancellationToken))!; } + + /// + /// Returns the response content, the response is successful + /// + /// Response + /// A for controlling the lifetime of the asynchronous operation. 
+ /// + protected static async Task ReadResponseBodyAsync(HttpResponseMessage response, CancellationToken cancellationToken) + { + return await response.Content.ReadAsStringAsync(cancellationToken); + } } } From d9c7ee69de11df6f23f85c27027d16d19b720066 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 18 Dec 2023 21:58:45 -0800 Subject: [PATCH 117/202] formatting for PR clarity --- src/Tes.ApiClients/HttpApiClient.cs | 52 ++++++++++++++--------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/src/Tes.ApiClients/HttpApiClient.cs b/src/Tes.ApiClients/HttpApiClient.cs index 15f0566ce..f9636d47f 100644 --- a/src/Tes.ApiClients/HttpApiClient.cs +++ b/src/Tes.ApiClients/HttpApiClient.cs @@ -116,32 +116,6 @@ protected async Task HttpSendRequestWithRetryPolicyAsync( return await HttpClient.SendAsync(request, ct); }, cancellationToken); - /// - /// Sends an Http request to the URL and deserializes the body response to the specified type - /// - /// Response's content deserialization type - /// Factory that creates new http requests, in the event of retry the factory is called again - /// and must be idempotent - /// A for controlling the lifetime of the asynchronous operation. 
- /// If true, the authentication header is set with an authentication token - /// - protected async Task HttpGetRequestWithRetryPolicyAsync( - Func httpRequestFactory, CancellationToken cancellationToken, bool setAuthorizationHeader = false) - { - return await cachingRetryHandler.ExecuteWithRetryAndConversionAsync(async ct => - { - var request = httpRequestFactory(); - - if (setAuthorizationHeader) - { - await AddAuthorizationHeaderToRequestAsync(request, ct); - } - - return await HttpClient.SendAsync(request, ct); - }, - GetApiResponseContentAsync, cancellationToken); - } - /// /// Sends a Http Get request to the URL and deserializes the body response to the specified type /// @@ -262,6 +236,32 @@ private async Task CreateGetHttpRequest(Uri requestUrl, bool return httpRequest; } + /// + /// Sends an Http request to the URL and deserializes the body response to the specified type + /// + /// Response's content deserialization type + /// Factory that creates new http requests, in the event of retry the factory is called again + /// and must be idempotent + /// A for controlling the lifetime of the asynchronous operation. 
+ /// If true, the authentication header is set with an authentication token + /// + protected async Task HttpGetRequestWithRetryPolicyAsync( + Func httpRequestFactory, CancellationToken cancellationToken, bool setAuthorizationHeader = false) + { + return await cachingRetryHandler.ExecuteWithRetryAndConversionAsync(async ct => + { + var request = httpRequestFactory(); + + if (setAuthorizationHeader) + { + await AddAuthorizationHeaderToRequestAsync(request, ct); + } + + return await HttpClient.SendAsync(request, ct); + }, + GetApiResponseContentAsync, cancellationToken); + } + private async Task AddAuthorizationHeaderToRequestAsync(HttpRequestMessage requestMessage, CancellationToken cancellationToken) { if (string.IsNullOrEmpty(tokenScope)) From b3f3e6c1bf4d713f3881a1a22e5db3f0c12c95e3 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 18 Dec 2023 22:03:21 -0800 Subject: [PATCH 118/202] Fix code doc comments --- src/Tes.ApiClients/HttpApiClient.cs | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/Tes.ApiClients/HttpApiClient.cs b/src/Tes.ApiClients/HttpApiClient.cs index f9636d47f..58e325f4d 100644 --- a/src/Tes.ApiClients/HttpApiClient.cs +++ b/src/Tes.ApiClients/HttpApiClient.cs @@ -119,12 +119,11 @@ protected async Task HttpSendRequestWithRetryPolicyAsync( /// /// Sends a Http Get request to the URL and deserializes the body response to the specified type /// - /// Response's content deserialization type /// /// /// /// A for controlling the lifetime of the asynchronous operation. 
- /// + /// Response's content deserialization type /// protected async Task HttpGetRequestAsync(Uri requestUrl, bool setAuthorizationHeader, bool cacheResults, CancellationToken cancellationToken) @@ -140,10 +139,10 @@ protected async Task HttpGetRequestAsync(Uri requestUrl, b /// /// Checks the cache and if the request was not found, sends the GET request with a retry policy /// - /// Response's content deserialization type /// /// A for controlling the lifetime of the asynchronous operation. /// + /// Response's content deserialization type /// protected async Task HttpGetRequestWithCachingAndRetryPolicyAsync(Uri requestUrl, CancellationToken cancellationToken, bool setAuthorizationHeader = false) @@ -163,10 +162,10 @@ protected async Task HttpGetRequestWithCachingAndRetryPolicyAsync /// Get request with retry policy /// - /// Response's content deserialization type /// /// A for controlling the lifetime of the asynchronous operation. /// + /// Response's content deserialization type /// protected async Task HttpGetRequestWithRetryPolicyAsync(Uri requestUrl, CancellationToken cancellationToken, bool setAuthorizationHeader = false) @@ -239,11 +238,11 @@ private async Task CreateGetHttpRequest(Uri requestUrl, bool /// /// Sends an Http request to the URL and deserializes the body response to the specified type /// - /// Response's content deserialization type /// Factory that creates new http requests, in the event of retry the factory is called again /// and must be idempotent /// A for controlling the lifetime of the asynchronous operation. /// If true, the authentication header is set with an authentication token + /// Response's content deserialization type /// protected async Task HttpGetRequestWithRetryPolicyAsync( Func httpRequestFactory, CancellationToken cancellationToken, bool setAuthorizationHeader = false) @@ -346,13 +345,13 @@ private string ToHash(string input) /// /// Response /// A for controlling the lifetime of the asynchronous operation. 
- /// Response's content deserialization type + /// Response's content deserialization type /// - protected static async Task GetApiResponseContentAsync(HttpResponseMessage response, CancellationToken cancellationToken) + protected static async Task GetApiResponseContentAsync(HttpResponseMessage response, CancellationToken cancellationToken) { response.EnsureSuccessStatusCode(); - return JsonSerializer.Deserialize(await ReadResponseBodyAsync(response, cancellationToken))!; + return JsonSerializer.Deserialize(await ReadResponseBodyAsync(response, cancellationToken))!; } /// From a24a903578964a9dff991a5682b88afc9f4b644d Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 18 Dec 2023 22:21:13 -0800 Subject: [PATCH 119/202] fix test --- src/Tes.ApiClients.Tests/PriceApiClientTests.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Tes.ApiClients.Tests/PriceApiClientTests.cs b/src/Tes.ApiClients.Tests/PriceApiClientTests.cs index 8ff9d8c11..7517f09b5 100644 --- a/src/Tes.ApiClients.Tests/PriceApiClientTests.cs +++ b/src/Tes.ApiClients.Tests/PriceApiClientTests.cs @@ -48,11 +48,11 @@ public async Task GetPricingInformationPageAsync_ReturnsSinglePageAndCaches() { var page = await pricingApiClient.GetPricingInformationPageAsync(0, "westus2", CancellationToken.None, cacheResults: true); var cacheKey = await pricingApiClient.ToCacheKeyAsync(new Uri(page.RequestLink), false, CancellationToken.None); - var cachedPage = JsonSerializer.Deserialize(appCache.Get(cacheKey)!); + var cachedPage = appCache.Get(cacheKey); Assert.IsNotNull(page); Assert.IsTrue(page.Items.Length > 0); Assert.IsNotNull(cachedPage); - Assert.IsTrue(page.Items.Length == cachedPage.Items.Length); + Assert.IsTrue(Enumerable.SequenceEqual(cachedPage.Items, page.Items)); } [TestMethod] From 247cdd6401783184fdbd9a50a54fca4a4e0a6c74 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 20 Dec 2023 22:36:56 -0800 Subject: [PATCH 120/202] formatting --- 
src/deploy-tes-on-azure/Deployer.cs | 260 ++++++++++++++-------------- 1 file changed, 130 insertions(+), 130 deletions(-) diff --git a/src/deploy-tes-on-azure/Deployer.cs b/src/deploy-tes-on-azure/Deployer.cs index a1961cd7b..8edd800fc 100644 --- a/src/deploy-tes-on-azure/Deployer.cs +++ b/src/deploy-tes-on-azure/Deployer.cs @@ -360,117 +360,117 @@ await Execute("Connecting to Azure Services...", async () => await PerformHelmDeploymentAsync(resourceGroup); } - if (!configuration.Update) - { - if (string.IsNullOrWhiteSpace(configuration.BatchPrefix)) + if (!configuration.Update) { - var blob = new byte[5]; - RandomNumberGenerator.Fill(blob); - configuration.BatchPrefix = blob.ConvertToBase32().TrimEnd('='); - } + if (string.IsNullOrWhiteSpace(configuration.BatchPrefix)) + { + var blob = new byte[5]; + RandomNumberGenerator.Fill(blob); + configuration.BatchPrefix = blob.ConvertToBase32().TrimEnd('='); + } - ValidateRegionName(configuration.RegionName); - ValidateMainIdentifierPrefix(configuration.MainIdentifierPrefix); - storageAccount = await ValidateAndGetExistingStorageAccountAsync(); - batchAccount = await ValidateAndGetExistingBatchAccountAsync(); - aksCluster = await ValidateAndGetExistingAKSClusterAsync(); - postgreSqlFlexServer = await ValidateAndGetExistingPostgresqlServerAsync(); - var keyVault = await ValidateAndGetExistingKeyVaultAsync(); + ValidateRegionName(configuration.RegionName); + ValidateMainIdentifierPrefix(configuration.MainIdentifierPrefix); + storageAccount = await ValidateAndGetExistingStorageAccountAsync(); + batchAccount = await ValidateAndGetExistingBatchAccountAsync(); + aksCluster = await ValidateAndGetExistingAKSClusterAsync(); + postgreSqlFlexServer = await ValidateAndGetExistingPostgresqlServerAsync(); + var keyVault = await ValidateAndGetExistingKeyVaultAsync(); - if (aksCluster is null && !configuration.ManualHelmDeployment) - { - await ValidateVmAsync(); - } + if (aksCluster is null && !configuration.ManualHelmDeployment) + { + 
await ValidateVmAsync(); + } - ConsoleEx.WriteLine($"Deploying TES on Azure version {targetVersion}..."); + ConsoleEx.WriteLine($"Deploying TES on Azure version {targetVersion}..."); - // Configuration preferences not currently settable by user. - if (string.IsNullOrWhiteSpace(configuration.PostgreSqlServerName)) - { - configuration.PostgreSqlServerName = SdkContext.RandomResourceName($"{configuration.MainIdentifierPrefix}-", 15); - } + // Configuration preferences not currently settable by user. + if (string.IsNullOrWhiteSpace(configuration.PostgreSqlServerName)) + { + configuration.PostgreSqlServerName = SdkContext.RandomResourceName($"{configuration.MainIdentifierPrefix}-", 15); + } - configuration.PostgreSqlAdministratorPassword = PasswordGenerator.GeneratePassword(); - configuration.PostgreSqlTesUserPassword = PasswordGenerator.GeneratePassword(); + configuration.PostgreSqlAdministratorPassword = PasswordGenerator.GeneratePassword(); + configuration.PostgreSqlTesUserPassword = PasswordGenerator.GeneratePassword(); - if (string.IsNullOrWhiteSpace(configuration.BatchAccountName)) - { - configuration.BatchAccountName = SdkContext.RandomResourceName($"{configuration.MainIdentifierPrefix}", 15); - } + if (string.IsNullOrWhiteSpace(configuration.BatchAccountName)) + { + configuration.BatchAccountName = SdkContext.RandomResourceName($"{configuration.MainIdentifierPrefix}", 15); + } - if (string.IsNullOrWhiteSpace(configuration.StorageAccountName)) - { - configuration.StorageAccountName = SdkContext.RandomResourceName($"{configuration.MainIdentifierPrefix}", 24); - } + if (string.IsNullOrWhiteSpace(configuration.StorageAccountName)) + { + configuration.StorageAccountName = SdkContext.RandomResourceName($"{configuration.MainIdentifierPrefix}", 24); + } - //if (string.IsNullOrWhiteSpace(configuration.NetworkSecurityGroupName)) - //{ - // configuration.NetworkSecurityGroupName = SdkContext.RandomResourceName($"{configuration.MainIdentifierPrefix}", 15); - //} + //if 
(string.IsNullOrWhiteSpace(configuration.NetworkSecurityGroupName)) + //{ + // configuration.NetworkSecurityGroupName = SdkContext.RandomResourceName($"{configuration.MainIdentifierPrefix}", 15); + //} - if (string.IsNullOrWhiteSpace(configuration.ApplicationInsightsAccountName)) - { - configuration.ApplicationInsightsAccountName = SdkContext.RandomResourceName($"{configuration.MainIdentifierPrefix}-", 15); - } + if (string.IsNullOrWhiteSpace(configuration.ApplicationInsightsAccountName)) + { + configuration.ApplicationInsightsAccountName = SdkContext.RandomResourceName($"{configuration.MainIdentifierPrefix}-", 15); + } - if (string.IsNullOrWhiteSpace(configuration.TesPassword)) - { - configuration.TesPassword = PasswordGenerator.GeneratePassword(); - } + if (string.IsNullOrWhiteSpace(configuration.TesPassword)) + { + configuration.TesPassword = PasswordGenerator.GeneratePassword(); + } - if (string.IsNullOrWhiteSpace(configuration.AksClusterName)) - { - configuration.AksClusterName = SdkContext.RandomResourceName($"{configuration.MainIdentifierPrefix}-", 25); - } + if (string.IsNullOrWhiteSpace(configuration.AksClusterName)) + { + configuration.AksClusterName = SdkContext.RandomResourceName($"{configuration.MainIdentifierPrefix}-", 25); + } - if (string.IsNullOrWhiteSpace(configuration.KeyVaultName)) - { - configuration.KeyVaultName = SdkContext.RandomResourceName($"{configuration.MainIdentifierPrefix}-", 15); - } + if (string.IsNullOrWhiteSpace(configuration.KeyVaultName)) + { + configuration.KeyVaultName = SdkContext.RandomResourceName($"{configuration.MainIdentifierPrefix}-", 15); + } - await RegisterResourceProvidersAsync(); - await RegisterResourceProviderFeaturesAsync(); + await RegisterResourceProvidersAsync(); + await RegisterResourceProviderFeaturesAsync(); - if (batchAccount is null) - { - await ValidateBatchAccountQuotaAsync(); - } + if (batchAccount is null) + { + await ValidateBatchAccountQuotaAsync(); + } - var vnetAndSubnet = await 
ValidateAndGetExistingVirtualNetworkAsync(); + var vnetAndSubnet = await ValidateAndGetExistingVirtualNetworkAsync(); - if (string.IsNullOrWhiteSpace(configuration.ResourceGroupName)) - { - configuration.ResourceGroupName = SdkContext.RandomResourceName($"{configuration.MainIdentifierPrefix}-", 15); - resourceGroup = await CreateResourceGroupAsync(); - isResourceGroupCreated = true; - } - else - { - resourceGroup = await azureSubscriptionClient.ResourceGroups.GetByNameAsync(configuration.ResourceGroupName, cts.Token); - } + if (string.IsNullOrWhiteSpace(configuration.ResourceGroupName)) + { + configuration.ResourceGroupName = SdkContext.RandomResourceName($"{configuration.MainIdentifierPrefix}-", 15); + resourceGroup = await CreateResourceGroupAsync(); + isResourceGroupCreated = true; + } + else + { + resourceGroup = await azureSubscriptionClient.ResourceGroups.GetByNameAsync(configuration.ResourceGroupName, cts.Token); + } - // Derive TES ingress URL from resource group name - kubernetesManager.SetTesIngressNetworkingConfiguration(configuration.ResourceGroupName); + // Derive TES ingress URL from resource group name + kubernetesManager.SetTesIngressNetworkingConfiguration(configuration.ResourceGroupName); - managedIdentity = await CreateUserManagedIdentityAsync(resourceGroup); + managedIdentity = await CreateUserManagedIdentityAsync(resourceGroup); - if (vnetAndSubnet is not null) - { - ConsoleEx.WriteLine($"Creating VM in existing virtual network {vnetAndSubnet.Value.virtualNetwork.Name} and subnet {vnetAndSubnet.Value.vmSubnet.Name}"); - } + if (vnetAndSubnet is not null) + { + ConsoleEx.WriteLine($"Creating VM in existing virtual network {vnetAndSubnet.Value.virtualNetwork.Name} and subnet {vnetAndSubnet.Value.vmSubnet.Name}"); + } - if (storageAccount is not null) - { - ConsoleEx.WriteLine($"Using existing Storage Account {storageAccount.Name}"); - } + if (storageAccount is not null) + { + ConsoleEx.WriteLine($"Using existing Storage Account 
{storageAccount.Name}"); + } - if (batchAccount is not null) - { - ConsoleEx.WriteLine($"Using existing Batch Account {batchAccount.Name}"); - } + if (batchAccount is not null) + { + ConsoleEx.WriteLine($"Using existing Batch Account {batchAccount.Name}"); + } - await Task.WhenAll(new Task[] - { + await Task.WhenAll(new Task[] + { Task.Run(async () => { if (vnetAndSubnet is null) @@ -506,26 +506,26 @@ await Task.WhenAll(new Task[] await AssignManagedIdOperatorToResourceAsync(managedIdentity, resourceGroup); await AssignMIAsNetworkContributorToResourceAsync(managedIdentity, resourceGroup); }), - }); + }); - if (configuration.CrossSubscriptionAKSDeployment.GetValueOrDefault()) - { - await Task.Run(async () => + if (configuration.CrossSubscriptionAKSDeployment.GetValueOrDefault()) { - keyVault ??= await CreateKeyVaultAsync(configuration.KeyVaultName, managedIdentity, vnetAndSubnet.Value.vmSubnet); - keyVaultUri = keyVault.Properties.VaultUri; - var keys = await storageAccount.GetKeysAsync(); - await SetStorageKeySecret(keyVaultUri, StorageAccountKeySecretName, keys[0].Value); - }); - } + await Task.Run(async () => + { + keyVault ??= await CreateKeyVaultAsync(configuration.KeyVaultName, managedIdentity, vnetAndSubnet.Value.vmSubnet); + keyVaultUri = keyVault.Properties.VaultUri; + var keys = await storageAccount.GetKeysAsync(); + await SetStorageKeySecret(keyVaultUri, StorageAccountKeySecretName, keys[0].Value); + }); + } - if (postgreSqlFlexServer is null) - { - postgreSqlDnsZone = await CreatePrivateDnsZoneAsync(vnetAndSubnet.Value.virtualNetwork, $"privatelink.postgres.database.azure.com", "PostgreSQL Server"); - } + if (postgreSqlFlexServer is null) + { + postgreSqlDnsZone = await CreatePrivateDnsZoneAsync(vnetAndSubnet.Value.virtualNetwork, $"privatelink.postgres.database.azure.com", "PostgreSQL Server"); + } - await Task.WhenAll(new[] - { + await Task.WhenAll(new[] + { Task.Run(async () => { if (aksCluster is null && !configuration.ManualHelmDeployment) @@ 
-548,38 +548,38 @@ await Task.WhenAll(new[] }) }); - var clientId = managedIdentity.ClientId; - var settings = ConfigureSettings(clientId); + var clientId = managedIdentity.ClientId; + var settings = ConfigureSettings(clientId); - await kubernetesManager.UpdateHelmValuesAsync(storageAccount, keyVaultUri, resourceGroup.Name, settings, managedIdentity); - await PerformHelmDeploymentAsync(resourceGroup, - new[] - { + await kubernetesManager.UpdateHelmValuesAsync(storageAccount, keyVaultUri, resourceGroup.Name, settings, managedIdentity); + await PerformHelmDeploymentAsync(resourceGroup, + new[] + { "Run the following postgresql command to setup the database.", $"\tPostgreSQL command: psql postgresql://{configuration.PostgreSqlAdministratorLogin}:{configuration.PostgreSqlAdministratorPassword}@{configuration.PostgreSqlServerName}.postgres.database.azure.com/{configuration.PostgreSqlTesDatabaseName} -c \"{GetCreateTesUserString()}\"" - }, - async kubernetesClient => - { - await kubernetesManager.DeployCoADependenciesAsync(); + }, + async kubernetesClient => + { + await kubernetesManager.DeployCoADependenciesAsync(); - // Deploy an ubuntu pod to run PSQL commands, then delete it - const string deploymentNamespace = "default"; - var (deploymentName, ubuntuDeployment) = KubernetesManager.GetUbuntuDeploymentTemplate(); - await kubernetesClient.AppsV1.CreateNamespacedDeploymentAsync(ubuntuDeployment, deploymentNamespace, cancellationToken: cts.Token); - await ExecuteQueriesOnAzurePostgreSQLDbFromK8(kubernetesClient, deploymentName, deploymentNamespace); - await kubernetesClient.AppsV1.DeleteNamespacedDeploymentAsync(deploymentName, deploymentNamespace, cancellationToken: cts.Token); + // Deploy an ubuntu pod to run PSQL commands, then delete it + const string deploymentNamespace = "default"; + var (deploymentName, ubuntuDeployment) = KubernetesManager.GetUbuntuDeploymentTemplate(); + await kubernetesClient.AppsV1.CreateNamespacedDeploymentAsync(ubuntuDeployment, 
deploymentNamespace, cancellationToken: cts.Token); + await ExecuteQueriesOnAzurePostgreSQLDbFromK8(kubernetesClient, deploymentName, deploymentNamespace); + await kubernetesClient.AppsV1.DeleteNamespacedDeploymentAsync(deploymentName, deploymentNamespace, cancellationToken: cts.Token); - if (configuration.EnableIngress.GetValueOrDefault()) - { - await Execute( - $"Enabling Ingress {kubernetesManager.TesHostname}", - async () => - { - _ = await kubernetesManager.EnableIngress(configuration.TesUsername, configuration.TesPassword, kubernetesClient); - }); - } - }); + if (configuration.EnableIngress.GetValueOrDefault()) + { + await Execute( + $"Enabling Ingress {kubernetesManager.TesHostname}", + async () => + { + _ = await kubernetesManager.EnableIngress(configuration.TesUsername, configuration.TesPassword, kubernetesClient); + }); + } + }); } if (configuration.OutputTesCredentialsJson.GetValueOrDefault()) From 1d65f72fb52031c08a686ff8b19601a2742f081f Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 22 Dec 2023 19:45:51 -0800 Subject: [PATCH 121/202] queued task pipeline (batch api call reductions) --- src/TesApi.Tests/BatchSchedulerTests.cs | 60 +++-- src/TesApi.Web/AzureProxy.cs | 9 +- src/TesApi.Web/BatchPool.cs | 65 +++-- src/TesApi.Web/BatchPools.BatchScheduler.cs | 1 - src/TesApi.Web/BatchScheduler.cs | 252 +++++++++++------- .../CachingWithRetriesAzureProxy.cs | 4 +- src/TesApi.Web/IAzureProxy.cs | 5 +- .../Quotas/CheckGroupPoolAndJobQuotaResult.cs | 4 +- src/TesApi.Web/RelatedTask.cs | 5 +- 9 files changed, 237 insertions(+), 168 deletions(-) diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index 8330965d3..1a1318f9a 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -607,8 +607,8 @@ public Task AddBatchTaskHandlesBatchClientException() return AddBatchTasksHandlesExceptions(TesState.SYSTEMERROREnum, Arranger, Validator); (Action, Action>) 
Arranger(AzureProxyReturnValues _1) - => (default, azureProxy => azureProxy.Setup(b => b.AddBatchTaskAsync(It.IsAny(), It.IsAny(), It.IsAny(), It.IsAny())) - .Callback((_1, _2, _3, _4) + => (default, azureProxy => azureProxy.Setup(b => b.AddBatchTasksAsync(It.IsAny>(), It.IsAny(), It.IsAny())) + .Callback, string, CancellationToken>((_1, _2, _3) => throw typeof(BatchClientException) .GetConstructor(System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance, new[] { typeof(string), typeof(Exception) }) @@ -740,10 +740,9 @@ public async Task BatchJobContainsExpectedBatchPoolInformation() await foreach (var _ in batchScheduler.ProcessQueuedTesTasksAsync(new[] { tesTask }, CancellationToken.None)) { } var createBatchPoolAsyncInvocation = serviceProvider.AzureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IAzureProxy.CreateBatchPoolAsync)); - var addBatchTaskAsyncInvocation = serviceProvider.AzureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IAzureProxy.AddBatchTaskAsync)); + var addBatchTaskAsyncInvocation = serviceProvider.AzureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IAzureProxy.AddBatchTasksAsync)); - var cloudTask = addBatchTaskAsyncInvocation?.Arguments[1] as CloudTask; - var poolId = addBatchTaskAsyncInvocation?.Arguments[2] as string; + var poolId = addBatchTaskAsyncInvocation?.Arguments[1] as string; var pool = createBatchPoolAsyncInvocation?.Arguments[0] as Pool; GuardAssertsWithTesTask(tesTask, () => @@ -793,7 +792,7 @@ public async Task PreemptibleTesTaskGetsScheduledToLowPriorityVm() var tesTask = GetTesTask(); tesTask.Resources.Preemptible = true; - (_, _, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); + (_, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(AzureProxyReturnValues.Defaults), 
AzureProxyReturnValues.Defaults); GuardAssertsWithTesTask(tesTask, () => { @@ -809,7 +808,7 @@ public async Task NonPreemptibleTesTaskGetsScheduledToDedicatedVm() var tesTask = GetTesTask(); tesTask.Resources.Preemptible = false; - (_, _, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); + (_, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); GuardAssertsWithTesTask(tesTask, () => { @@ -825,7 +824,7 @@ public async Task PreemptibleTesTaskGetsScheduledToLowPriorityVm_PerVMFamilyEnfo var tesTask = GetTesTask(); tesTask.Resources.Preemptible = true; - (_, _, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(AzureProxyReturnValues.DefaultsPerVMFamilyEnforced), AzureProxyReturnValues.DefaultsPerVMFamilyEnforced); + (_, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(AzureProxyReturnValues.DefaultsPerVMFamilyEnforced), AzureProxyReturnValues.DefaultsPerVMFamilyEnforced); GuardAssertsWithTesTask(tesTask, () => { @@ -841,7 +840,7 @@ public async Task NonPreemptibleTesTaskGetsScheduledToDedicatedVm_PerVMFamilyEnf var tesTask = GetTesTask(); tesTask.Resources.Preemptible = false; - (_, _, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(AzureProxyReturnValues.DefaultsPerVMFamilyEnforced), AzureProxyReturnValues.DefaultsPerVMFamilyEnforced); + (_, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(AzureProxyReturnValues.DefaultsPerVMFamilyEnforced), AzureProxyReturnValues.DefaultsPerVMFamilyEnforced); GuardAssertsWithTesTask(tesTask, () => { @@ -861,7 +860,7 @@ public async Task 
NonPreemptibleTesTaskGetsWarningAndIsScheduledToLowPriorityVmI var azureProxyReturnValues = AzureProxyReturnValues.DefaultsPerVMFamilyEnforced; azureProxyReturnValues.VmSizesAndPrices.First(vm => vm.VmSize.Equals("VmSize3", StringComparison.OrdinalIgnoreCase)).PricePerHour = 44; - (_, _, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(azureProxyReturnValues), azureProxyReturnValues); + (_, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(azureProxyReturnValues), azureProxyReturnValues); GuardAssertsWithTesTask(tesTask, () => { @@ -879,7 +878,7 @@ public async Task TesTaskGetsScheduledToLowPriorityVmIfSettingUsePreemptibleVmsO var config = GetMockConfig()() .Append(("BatchScheduling:UsePreemptibleVmsOnly", "true")); - (_, _, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, config, GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); + (_, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, config, GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); GuardAssertsWithTesTask(tesTask, () => Assert.IsTrue(pool.ScaleSettings.AutoScale.Formula.Contains("\n$TargetLowPriorityNodes = "))); } @@ -895,7 +894,7 @@ static async Task RunTest(string allowedVmSizes, TesState expectedTaskState, str var config = GetMockConfig()() .Append(("AllowedVmSizes", allowedVmSizes)); - (_, _, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, config, GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); + (_, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, config, GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); GuardAssertsWithTesTask(tesTask, () => { @@ -1422,7 +1421,7 @@ public async Task PrivateImagesArePulledUsingPoolConfiguration() 
GetMockAzureProxy(AzureProxyReturnValues.Defaults)(mock); azureProxy = mock; }); - (_, var cloudTask, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxySetter, AzureProxyReturnValues.Defaults); + (_, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxySetter, AzureProxyReturnValues.Defaults); var batchScript = (string)azureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IAzureProxy.UploadBlobAsync) && i.Arguments[0].ToString().Contains("/batch_script"))?.Arguments[1]; GuardAssertsWithTesTask(tesTask, () => @@ -1447,7 +1446,7 @@ public async Task PublicImagesArePulledInTaskCommand() GetMockAzureProxy(AzureProxyReturnValues.Defaults)(mock); azureProxy = mock; }); - (_, var cloudTask, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxySetter, AzureProxyReturnValues.Defaults); + (_, var cloudTasks, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxySetter, AzureProxyReturnValues.Defaults); var batchScript = (string)azureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IAzureProxy.UploadBlobAsync) && i.Arguments[0].ToString().Contains("/batch_script"))?.Arguments[1]; GuardAssertsWithTesTask(tesTask, () => @@ -1464,12 +1463,12 @@ public async Task PrivateContainersRunInsideDockerInDockerContainer() { var tesTask = GetTesTask(); - (_, var cloudTask, _, _) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); + (_, var cloudTasks, _) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); GuardAssertsWithTesTask(tesTask, () => { - Assert.IsNotNull(cloudTask.ContainerSettings); - Assert.AreEqual("docker", 
cloudTask.ContainerSettings.ImageName); + Assert.IsNotNull(cloudTasks.FirstOrDefault()?.ContainerSettings); + Assert.AreEqual("docker", cloudTasks.FirstOrDefault()?.ContainerSettings.ImageName); }); } @@ -1479,9 +1478,13 @@ public async Task PublicContainersRunInsideRegularTaskCommand() var tesTask = GetTesTask(); tesTask.Executors.First().Image = "ubuntu"; - (_, var cloudTask, _, _) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); + (_, var cloudTasks, _) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); - GuardAssertsWithTesTask(tesTask, () => Assert.IsNull(cloudTask.ContainerSettings)); + GuardAssertsWithTesTask(tesTask, () => + { + Assert.IsNotNull(cloudTasks.FirstOrDefault()); + Assert.IsNull(cloudTasks.FirstOrDefault()?.ContainerSettings); + }); } [DataTestMethod] @@ -1593,7 +1596,7 @@ public async Task PoolIsCreatedInSubnetWhenBatchNodesSubnetIdIsSet() var tesTask = GetTesTask(); var azureProxy = GetMockAzureProxy(AzureProxyReturnValues.Defaults); - (_, _, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, config, azureProxy, AzureProxyReturnValues.Defaults); + (_, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, config, azureProxy, AzureProxyReturnValues.Defaults); var poolNetworkConfiguration = pool.NetworkConfiguration; @@ -1614,7 +1617,7 @@ public async Task PoolIsCreatedWithoutPublicIpWhenSubnetAndDisableBatchNodesPubl var tesTask = GetTesTask(); var azureProxy = GetMockAzureProxy(AzureProxyReturnValues.Defaults); - (_, _, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, config, azureProxy, AzureProxyReturnValues.Defaults); + (_, _, var pool) = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, config, azureProxy, AzureProxyReturnValues.Defaults); var 
poolNetworkConfiguration = pool.NetworkConfiguration; @@ -1635,13 +1638,13 @@ public async Task PoolIsCreatedWithoutPublicIpWhenSubnetAndDisableBatchNodesPubl return (tesTask.Logs?.LastOrDefault()?.FailureReason, tesTask.Logs?.LastOrDefault()?.SystemLogs?.ToArray()); } - private static Task<(string JobId, CloudTask CloudTask, PoolInformation PoolInformation, Pool batchModelsPool)> ProcessTesTaskAndGetBatchJobArgumentsAsync() + private static Task<(string JobId, IEnumerable CloudTask, Pool batchModelsPool)> ProcessTesTaskAndGetBatchJobArgumentsAsync() => ProcessTesTaskAndGetBatchJobArgumentsAsync(GetTesTask(), GetMockConfig()(), GetMockAzureProxy(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); - private static Task<(string JobId, CloudTask CloudTask, PoolInformation PoolInformation, Pool batchModelsPool)> ProcessTesTaskAndGetBatchJobArgumentsAsync(TesTask tesTask, IEnumerable<(string Key, string Value)> configuration, Action> azureProxy, AzureProxyReturnValues azureProxyReturnValues, Action additionalActions = default, Action> serviceProviderActions = default) + private static Task<(string JobId, IEnumerable CloudTask, Pool batchModelsPool)> ProcessTesTaskAndGetBatchJobArgumentsAsync(TesTask tesTask, IEnumerable<(string Key, string Value)> configuration, Action> azureProxy, AzureProxyReturnValues azureProxyReturnValues, Action additionalActions = default, Action> serviceProviderActions = default) => ProcessTesTasksAndGetBatchJobArgumentsAsync(new[] { tesTask }, configuration, azureProxy, azureProxyReturnValues, additionalActions, serviceProviderActions); - private static async Task<(string JobId, CloudTask CloudTask, PoolInformation PoolInformation, Pool batchModelsPool)> ProcessTesTasksAndGetBatchJobArgumentsAsync(TesTask[] tesTasks, IEnumerable<(string Key, string Value)> configuration, Action> azureProxy, AzureProxyReturnValues azureProxyReturnValues, Action additionalActions = default, Action> serviceProviderActions = default) + private 
static async Task<(string JobId, IEnumerable CloudTasks, Pool batchModelsPool)> ProcessTesTasksAndGetBatchJobArgumentsAsync(TesTask[] tesTasks, IEnumerable<(string Key, string Value)> configuration, Action> azureProxy, AzureProxyReturnValues azureProxyReturnValues, Action additionalActions = default, Action> serviceProviderActions = default) { using var serviceProvider = GetServiceProvider( configuration, @@ -1663,14 +1666,13 @@ public async Task PoolIsCreatedWithoutPublicIpWhenSubnetAndDisableBatchNodesPubl } var createBatchPoolAsyncInvocation = serviceProvider.AzureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IAzureProxy.CreateBatchPoolAsync)); - var addBatchTaskAsyncInvocation = serviceProvider.AzureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IAzureProxy.AddBatchTaskAsync)); + var addBatchTaskAsyncInvocation = serviceProvider.AzureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IAzureProxy.AddBatchTasksAsync)); - var jobId = (addBatchTaskAsyncInvocation?.Arguments[0]) as string; - var cloudTask = (addBatchTaskAsyncInvocation?.Arguments[1]) as CloudTask; - var poolInformation = (addBatchTaskAsyncInvocation?.Arguments[2]) as PoolInformation; + var jobId = (addBatchTaskAsyncInvocation?.Arguments[1]) as string; + var cloudTask = (addBatchTaskAsyncInvocation?.Arguments[0]) as IEnumerable; var batchPoolsModel = createBatchPoolAsyncInvocation?.Arguments[0] as Pool; - return (jobId, cloudTask, poolInformation, batchPoolsModel); + return (jobId, cloudTask, batchPoolsModel); } private static Action> GetMockAllowedVms(IEnumerable<(string Key, string Value)> configuration) diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index 149184ff0..df9e3d8b9 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -171,17 +171,18 @@ public async Task CreateBatchJobAsync(string jobId, CancellationToken cancellati } /// - public async Task AddBatchTaskAsync(string tesTaskId, CloudTask 
cloudTask, string jobId, CancellationToken cancellationToken) + public async Task AddBatchTasksAsync(IEnumerable cloudTasks, string jobId, CancellationToken cancellationToken) { ArgumentException.ThrowIfNullOrEmpty(jobId); + ArgumentNullException.ThrowIfNull(cloudTasks); + + cloudTasks = cloudTasks.ToList(); - logger.LogInformation("TES task: {TesTask} - Adding task to job {BatchJob}", tesTaskId, jobId); var job = await batchRetryPolicyWhenJobNotFound.ExecuteWithRetryAsync(ct => batchClient.JobOperations.GetJobAsync(jobId, cancellationToken: ct), cancellationToken); - await job.AddTaskAsync(cloudTask, cancellationToken: cancellationToken); - logger.LogInformation("TES task: {TesTask} - Added task successfully", tesTaskId); + await job.AddTaskAsync(cloudTasks, new() { CancellationToken = cancellationToken, MaxDegreeOfParallelism = (int)Math.Ceiling((double)cloudTasks.Count() / Microsoft.Azure.Batch.Constants.MaxTasksInSingleAddTaskCollectionRequest) }); } /// diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index 0e167feb6..e55643425 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -271,10 +271,10 @@ public static string AutoPoolFormula(bool preemptable, int initialTarget) This is accomplished by calling doubleVec's GetSample method, which returns some number of the most recent available samples of the related metric. Then, a function is used to extract a scaler from the list of scalers (measurements). NOTE: there does not seem to be a "last" function. - Whenever autoscaling is turned on, whether or not the pool waw just created, there are no sampled metrics available. Thus, we need to prevent the + Whenever autoscaling is turned on, whether or not the pool was just created, there are no sampled metrics available. Thus, we need to prevent the expected errors that would result from trying to extract the samples. 
Later on, if recent samples aren't available, we prefer that the formula fails - (1- so we can potentially capture that, and 2- so that we don't suddenly try to remove all nodes from the pool when there's still demand) so we use a - timed scheme to substitue an "initial value" (aka initialTarget). + (firstly, so we can potentially capture that, and secondly, so that we don't suddenly try to remove all nodes from the pool when there's still demand) + so we use a timed scheme to substitue an "initial value" (aka initialTarget). We set NodeDeallocationOption to taskcompletion to prevent wasting time/money by stopping a running task, only to requeue it onto another node, or worse, fail it, just because batch's last sample was taken longer ago than a task's assignment was made to a node, because the formula evaluations intervals are not coordinated @@ -584,7 +584,7 @@ await PerformTask(ServicePoolAsync(ServiceKind.Rotate, cancellationToken), cance static IEnumerable Flatten(Exception ex) => ex switch { - AggregateException aggregateException => aggregateException.InnerExceptions, + AggregateException aggregateException => aggregateException.Flatten().InnerExceptions, _ => Enumerable.Empty().Append(ex), }; @@ -609,34 +609,50 @@ async ValueTask PerformTask(ValueTask serviceAction, CancellationToken can } // Returns true when pool/job was removed because it was not found. Returns false otherwise. 
- async ValueTask RemoveMissingPoolsAsync(Exception ex, CancellationToken cancellationToken) + ValueTask RemoveMissingPoolsAsync(Exception ex, CancellationToken cancellationToken) { - switch (ex) + return ex switch { - case AggregateException aggregateException: - var result = false; + AggregateException aggregateException => ParseAggregateException(aggregateException, cancellationToken), + BatchException batchException => ParseBatchException(batchException, cancellationToken), + _ => ParseException(ex, cancellationToken), + }; - foreach (var e in aggregateException.InnerExceptions) - { - result |= await RemoveMissingPoolsAsync(e, cancellationToken); - } + ValueTask ParseException(Exception exception, CancellationToken cancellationToken) + { + if (exception.InnerException is not null) + { + return RemoveMissingPoolsAsync(exception.InnerException, cancellationToken); + } - return result; + return ValueTask.FromResult(false); + } - case BatchException batchException: - if (batchException.RequestInformation.BatchError.Code == BatchErrorCodeStrings.PoolNotFound || - batchException.RequestInformation.BatchError.Code == BatchErrorCodeStrings.JobNotFound) - { - _logger.LogError(ex, "Batch pool and/or job {PoolId} is missing. 
Removing them from TES's active pool list.", Id); - _ = _batchPools.RemovePoolFromList(this); - await _batchPools.DeletePoolAndJobAsync(this, cancellationToken); - return true; - } + async ValueTask ParseAggregateException(AggregateException aggregateException, CancellationToken cancellationToken) + { + var result = false; - break; + foreach (var exception in aggregateException.InnerExceptions) + { + result |= await RemoveMissingPoolsAsync(exception, cancellationToken); + } + + return result; } - return false; + async ValueTask ParseBatchException(BatchException batchException, CancellationToken cancellationToken) + { + if (batchException.RequestInformation.BatchError.Code == BatchErrorCodeStrings.PoolNotFound || + batchException.RequestInformation.BatchError.Code == BatchErrorCodeStrings.JobNotFound) + { + _logger.LogError(batchException, "Batch pool and/or job {PoolId} is missing. Removing them from TES's active pool list.", Id); + _ = _batchPools.RemovePoolFromList(this); + await _batchPools.DeletePoolAndJobAsync(this, cancellationToken); + return true; + } + + return false; + } } } @@ -645,6 +661,7 @@ async ValueTask RemoveMissingPoolsAsync(Exception ex, CancellationToken ca private Lazy>> _lazyComputeNodes; private const string EjectableComputeNodesFilterClause = @"state eq 'starttaskfailed' or state eq 'preempted' or state eq 'unusable'"; + private string EjectableComputeNodesSelectClause() => ScalingMode.AutoScaleDisabled.Equals(_scalingMode) switch { diff --git a/src/TesApi.Web/BatchPools.BatchScheduler.cs b/src/TesApi.Web/BatchPools.BatchScheduler.cs index 756c2bdb8..4f790b0ff 100644 --- a/src/TesApi.Web/BatchPools.BatchScheduler.cs +++ b/src/TesApi.Web/BatchPools.BatchScheduler.cs @@ -11,7 +11,6 @@ using System.Threading; using System.Threading.Tasks; using CommonUtilities; -using Microsoft.Azure.Batch; using Microsoft.Azure.Batch.Common; using Microsoft.Extensions.Logging; using Tes.Models; diff --git a/src/TesApi.Web/BatchScheduler.cs 
b/src/TesApi.Web/BatchScheduler.cs index db5c22fa8..466bd8a1c 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -563,13 +563,17 @@ public string GetTesTaskIdFromCloudTaskId(string cloudTaskId) private static bool IsCromwellCommandScript(TesInput inputFile) => (inputFile.Name?.Equals("commandScript") ?? false) && (inputFile.Description?.EndsWith(".commandScript") ?? false) && inputFile.Type == TesFileType.FILEEnum && inputFile.Path.EndsWith($"/{CromwellScriptFileName}"); - private record struct QueuedTaskMetadata(TesTask TesTask, VirtualMachineInformation VirtualMachineInfo, IEnumerable Identities, string PoolDisplayName); + private record struct QueuedTaskPoolMetadata(TesTask TesTask, VirtualMachineInformation VirtualMachineInfo, IEnumerable Identities, string PoolDisplayName); + + private record struct QueuedTaskJobMetadata(string PoolKey, string JobId, VirtualMachineInformation VirtualMachineInfo, IEnumerable Tasks); + + private record struct QueuedTaskMetadata(string PoolKey, IEnumerable Tasks); /// public async IAsyncEnumerable> ProcessQueuedTesTasksAsync(TesTask[] tesTasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) { - var tasksMetadataByPoolKey = new ConcurrentDictionary>(); ConcurrentBag> results = new(); // Early item return facilitator + ConcurrentDictionary> tasksPoolMetadataByPoolKey = new(); { logger.LogDebug(@"Checking quota for {QueuedTasks} tasks.", tesTasks.Length); @@ -596,20 +600,13 @@ await Parallel.ForEachAsync(tesTasks, cancellationToken, async (tesTask, token) (poolKey, var displayName) = GetPoolKey(tesTask, virtualMachineInfo, identities); await quotaVerifier.CheckBatchAccountQuotasAsync(virtualMachineInfo, needPoolOrJobQuotaCheck: !IsPoolAvailable(poolKey), cancellationToken: token); - try - { - _ = tasksMetadataByPoolKey.AddOrUpdate(poolKey, - _1 => ImmutableArray.Empty.Add(new(tesTask, virtualMachineInfo, identities, displayName)), - (_1, list) => 
list.Add(new(tesTask, virtualMachineInfo, identities, displayName))); - } - catch (OverflowException) - { - throw; - } + _ = tasksPoolMetadataByPoolKey.AddOrUpdate(poolKey, + key => ImmutableArray.Empty.Add(new(tesTask, virtualMachineInfo, identities, displayName)), + (key, list) => list.Add(new(tesTask, virtualMachineInfo, identities, displayName))); } - catch (Exception ex) + catch (Exception exception) { - results.Add(new(HandleExceptionAsync(ex, poolKey, tesTask), tesTask)); + TaskCatchException(exception, tesTask, poolKey); } }); } @@ -620,7 +617,7 @@ await Parallel.ForEachAsync(tesTasks, cancellationToken, async (tesTask, token) yield return result; } - if (tasksMetadataByPoolKey.IsEmpty) + if (tasksPoolMetadataByPoolKey.IsEmpty) { yield break; } @@ -628,7 +625,8 @@ await Parallel.ForEachAsync(tesTasks, cancellationToken, async (tesTask, token) results.Clear(); // Determine how many nodes in each possibly new pool we might need for this group of tasks. - var neededPoolNodesByPoolKey = tasksMetadataByPoolKey.ToDictionary(t => t.Key, t => t.Value.Length); + var neededPoolNodesByPoolKey = tasksPoolMetadataByPoolKey.ToDictionary(t => t.Key, t => t.Value.Length); + var tasksJobMetadata = new ConcurrentBag(); { // Determine how many new pools/jobs we need now @@ -638,11 +636,11 @@ await Parallel.ForEachAsync(tesTasks, cancellationToken, async (tesTask, token) // This will remove pool keys we cannot accomodate due to quota, along with all of their associated tasks, from being queued into Batch. 
if (requiredNewPools.Skip(1).Any()) { - bool TryRemoveKeyAndTasks(string key, out (string Key, ImmutableArray ListOfTaskMetadata) result) + bool TryRemoveKeyAndTasks(string key, out (string Key, ImmutableArray ListOfTaskMetadata) result) { result = default; - if (tasksMetadataByPoolKey.TryRemove(key, out var listOfTaskMetadata)) + if (tasksPoolMetadataByPoolKey.TryRemove(key, out var listOfTaskMetadata)) { result = (key, listOfTaskMetadata); return true; @@ -651,12 +649,12 @@ bool TryRemoveKeyAndTasks(string key, out (string Key, ImmutableArray)>(TryRemoveKeyAndTasks) - .Take(excess)) + .SelectWhere)>(TryRemoveKeyAndTasks) + .Take(exceededQuantity)) { foreach (var task in listOfTaskMetadata.Select(m => m.TesTask)) { @@ -664,97 +662,70 @@ bool TryRemoveKeyAndTasks(string key, out (string Key, ImmutableArray l.Length)); - async ValueTask GetPoolIdAsync(string poolKey, IEnumerable metadata, CancellationToken cancellationToken) - { - metadata = metadata.ToList(); - var tasks = metadata.Select(m => m.TesTask); - var (_, virtualMachineInfo, identities, displayName) = metadata.First(); + logger.LogDebug(@"Obtaining {PoolQuantity} batch pool identifiers for {QueuedTasks} tasks.", tasksPoolMetadataByPoolKey.Count, tasksPoolMetadataByPoolKey.Values.Sum(l => l.Length)); - try - { - var useGen2 = virtualMachineInfo.HyperVGenerations?.Contains("V2"); - return (await GetOrAddPoolAsync( - key: poolKey, - isPreemptable: virtualMachineInfo.LowPriority, - modelPoolFactory: async (id, ct) => await GetPoolSpecification( - name: id, - displayName: displayName, - poolIdentity: GetBatchPoolIdentity(identities.ToArray()), - vmSize: virtualMachineInfo.VmSize, - autoscaled: true, - preemptable: virtualMachineInfo.LowPriority, - initialTarget: neededPoolNodesByPoolKey[poolKey], - nodeInfo: useGen2.GetValueOrDefault() ? 
gen2BatchNodeInfo : gen1BatchNodeInfo, - encryptionAtHostSupported: virtualMachineInfo.EncryptionAtHostSupported, - cancellationToken: ct), - cancellationToken: cancellationToken)).Id; - } - catch (AggregateException aggregateException) + await Parallel.ForEachAsync(tasksPoolMetadataByPoolKey, cancellationToken, async(pool, token) => { - var exceptions = new List(); - var innerExceptions = aggregateException.Flatten().InnerExceptions; + var (_, virtualMachineInfo, identities, displayName) = pool.Value.First(); - foreach (var tesTask in tasks) + try { - foreach (var partResult in innerExceptions - .Select(ex => HandleExceptionAsync(ex, poolKey, tesTask))) + var useGen2 = virtualMachineInfo.HyperVGenerations?.Contains("V2") ?? false; + var poolId = (await GetOrAddPoolAsync( + key: pool.Key, + isPreemptable: virtualMachineInfo.LowPriority, + modelPoolFactory: async (id, ct) => await GetPoolSpecification( + name: id, + displayName: displayName, + poolIdentity: GetBatchPoolIdentity(identities.ToArray()), + vmSize: virtualMachineInfo.VmSize, + autoscaled: true, + preemptable: virtualMachineInfo.LowPriority, + initialTarget: neededPoolNodesByPoolKey[pool.Key], + nodeInfo: useGen2 ? gen2BatchNodeInfo : gen1BatchNodeInfo, + encryptionAtHostSupported: virtualMachineInfo.EncryptionAtHostSupported, + cancellationToken: ct), + cancellationToken: token)).Id; + + tasksJobMetadata.Add(new(pool.Key, poolId, virtualMachineInfo, pool.Value.Select(tuple => tuple.TesTask))); + } + catch (AggregateException aggregateException) + { + var innerExceptions = aggregateException.Flatten().InnerExceptions; + + foreach (var tesTask in pool.Value.Select(tuple => tuple.TesTask)) { - if (partResult.IsFaulted) - { - exceptions.Add(partResult.Exception); - } + TaskCatchAggregateException(innerExceptions, tesTask, pool.Key); } - - results.Add(new(exceptions.Count == 0 - ? 
Task.FromResult(true) - : Task.FromException(new AggregateException(exceptions)), - tesTask)); } - } - catch (Exception exception) - { - foreach (var tesTask in tasks) + catch (Exception exception) { - results.Add(new(HandleExceptionAsync(exception, poolKey, tesTask), tesTask)); + foreach (var tesTask in pool.Value.Select(tuple => tuple.TesTask)) + { + TaskCatchException(exception, tesTask, pool.Key); + } } - } - - return null; + }); } - // TODO: Consider parallelizing this expression. Doing so would require making GetOrAddPoolAsync multi-threaded safe. - var tasksMetadata = tasksMetadataByPoolKey.ToAsyncEnumerable().SelectAwaitWithCancellation(async (pair, token) => - (pair.Key, Id: await GetPoolIdAsync(pair.Key, pair.Value, token), TaskMetadata: pair.Value)) - .Where(tuple => tuple.Id is not null) - .SelectMany(tuple => tuple.TaskMetadata.ToAsyncEnumerable().Select(metadata => (metadata.TesTask, metadata.VirtualMachineInfo, tuple.Key, tuple.Id))) - .ToBlockingEnumerable(cancellationToken); - - tasksMetadata = tasksMetadata.ToList(); - // Return any results that are ready foreach (var result in results) { yield return result; } - if (!tasksMetadata.Any()) + if (!tasksJobMetadata.Any()) { yield break; } results.Clear(); - logger.LogDebug(@"Creating batch tasks for {QueuedTasks} tasks.", tasksMetadata.Count()); + var tasksMetadata = new ConcurrentBag(); - // Obtain assigned pool and create and assign the cloudtask for each task. 
- await Parallel.ForEachAsync(tasksMetadata, cancellationToken, async (metadata, token) => + async Task GetCloudTaskAsync(TesTask tesTask, VirtualMachineInformation virtualMachineInfo, string poolKey, string poolId, CancellationToken cancellationToken) { - var (tesTask, virtualMachineInfo, poolKey, poolId) = metadata; - try { var tesTaskLog = tesTask.AddTesTaskLog(); @@ -764,33 +735,83 @@ await Parallel.ForEachAsync(tasksMetadata, cancellationToken, async (metadata, t var cloudTask = await ConvertTesTaskToBatchTaskUsingRunnerAsync(cloudTaskId, tesTask, cancellationToken); logger.LogInformation(@"Creating batch task for TES task {TesTaskId}. Using VM size {VmSize}.", tesTask.Id, virtualMachineInfo.VmSize); - await azureProxy.AddBatchTaskAsync(tesTask.Id, cloudTask, poolId, cancellationToken); - - tesTaskLog.StartTime = DateTimeOffset.UtcNow; - tesTask.State = TesState.INITIALIZINGEnum; - results.Add(new(Task.FromResult(true), tesTask)); + return cloudTask; } catch (AggregateException aggregateException) { - var exceptions = new List(); + TaskCatchAggregateException(aggregateException.Flatten().InnerExceptions, tesTask, poolKey); + } + catch (Exception exception) + { + TaskCatchException(exception, tesTask, poolKey); + } + + return null; + } + + await Parallel.ForEachAsync( + tasksJobMetadata.Select(metadata => (metadata.JobId, metadata.PoolKey, metadata.Tasks, CloudTasks: metadata.Tasks.Select(task => new RelatedTask(GetCloudTaskAsync(task, metadata.VirtualMachineInfo, metadata.PoolKey, metadata.JobId, cancellationToken), task)).WhenEach(cancellationToken, task => task.Task))), + cancellationToken, + async (metadata, token) => + { + var (jobId, poolKey, tasks, relatedCloudTasks) = metadata; - foreach (var partResult in aggregateException.Flatten().InnerExceptions - .Select(ex => HandleExceptionAsync(ex, poolKey, tesTask))) + try { - if (partResult.IsFaulted) + var cloudTasks = (await relatedCloudTasks.ToListAsync(token)).Where(task => task.Task.Result is not null); 
+ await azureProxy.AddBatchTasksAsync(cloudTasks.Select(task => task.Task.Result), jobId, token); + + tasksMetadata.Add(new(poolKey, cloudTasks.Select(task => task.Related))); + } + catch (AggregateException aggregateException) + { + var innerExceptions = aggregateException.Flatten().InnerExceptions; + + foreach (var tesTask in tasks) + { + TaskCatchAggregateException(innerExceptions, tesTask, poolKey); + } + } + catch (Exception exception) + { + foreach (var tesTask in tasks) { - exceptions.Add(partResult.Exception); + TaskCatchException(exception, tesTask, poolKey); } } + }); + + // Return any results that are ready + foreach (var result in results) + { + yield return result; + } + + if (!tasksMetadata.Any()) + { + yield break; + } + + results.Clear(); - results.Add(new(exceptions.Count == 0 - ? Task.FromResult(true) - : Task.FromException(new AggregateException(exceptions)), - tesTask)); + _ = Parallel.ForEach(tasksMetadata.SelectMany(metadata => metadata.Tasks.Select(task => (task, metadata.PoolKey))), metadata => + { + var (tesTask, poolKey) = metadata; + + try + { + var tesTaskLog = tesTask.GetOrAddTesTaskLog(); + tesTaskLog.StartTime = DateTimeOffset.UtcNow; + tesTask.State = TesState.INITIALIZINGEnum; + results.Add(new(Task.FromResult(true), tesTask)); + } + catch (AggregateException aggregateException) + { + TaskCatchAggregateException(aggregateException.Flatten().InnerExceptions, tesTask, poolKey); } catch (Exception exception) { - results.Add(new(HandleExceptionAsync(exception, poolKey, tesTask), tesTask)); + TaskCatchException(exception, tesTask, poolKey); } }); @@ -801,6 +822,35 @@ await Parallel.ForEachAsync(tasksMetadata, cancellationToken, async (metadata, t yield break; + void TaskCatchException(Exception exception, TesTask tesTask, string poolKey) + { + results.Add(new(HandleExceptionAsync(exception, poolKey, tesTask), tesTask)); + } + + void TaskCatchAggregateException(IEnumerable innerExceptions, TesTask tesTask, string poolKey) + { + var result 
= false; + var exceptions = new List(); + + foreach (var partResult in innerExceptions + .Select(ex => HandleExceptionAsync(ex, poolKey, tesTask))) + { + if (partResult.IsFaulted) + { + exceptions.Add(partResult.Exception); + } + else + { + result |= partResult.Result; + } + } + + results.Add(new(exceptions.Count == 0 + ? Task.FromResult(result) + : Task.FromException(new AggregateException(exceptions)), + tesTask)); + } + Task HandleExceptionAsync(Exception exception, string poolKey, TesTask tesTask) { switch (exception) diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index daf2359b1..7f3934b73 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -85,11 +85,11 @@ public async Task CreateBatchJobAsync(string jobId, CancellationToken cancellati } /// - public async Task AddBatchTaskAsync(string tesTaskId, CloudTask cloudTask, string jobId, CancellationToken cancellationToken) + public async Task AddBatchTasksAsync(IEnumerable cloudTasks, string jobId, CancellationToken cancellationToken) { try { - await cachingAsyncRetryExceptWhenExists.ExecuteWithRetryAsync(ct => azureProxy.AddBatchTaskAsync(tesTaskId, cloudTask, jobId, ct), cancellationToken); + await cachingAsyncRetryExceptWhenExists.ExecuteWithRetryAsync(ct => azureProxy.AddBatchTasksAsync(cloudTasks, jobId, ct), cancellationToken); } catch (BatchException exc) when (BatchErrorCodeStrings.TaskExists.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { } diff --git a/src/TesApi.Web/IAzureProxy.cs b/src/TesApi.Web/IAzureProxy.cs index 8b039b8e2..c45231043 100644 --- a/src/TesApi.Web/IAzureProxy.cs +++ b/src/TesApi.Web/IAzureProxy.cs @@ -26,11 +26,10 @@ public interface IAzureProxy /// /// Adds a to the job."/> /// - /// - /// + /// /// /// A for controlling the lifetime of the asynchronous operation. 
- Task AddBatchTaskAsync(string tesTaskId, CloudTask cloudTask, string jobId, CancellationToken cancellationToken); + Task AddBatchTasksAsync(IEnumerable cloudTasks, string jobId, CancellationToken cancellationToken); /// /// Terminates and deletes an Azure Batch job for diff --git a/src/TesApi.Web/Management/Models/Quotas/CheckGroupPoolAndJobQuotaResult.cs b/src/TesApi.Web/Management/Models/Quotas/CheckGroupPoolAndJobQuotaResult.cs index 985e02390..043af0ac5 100644 --- a/src/TesApi.Web/Management/Models/Quotas/CheckGroupPoolAndJobQuotaResult.cs +++ b/src/TesApi.Web/Management/Models/Quotas/CheckGroupPoolAndJobQuotaResult.cs @@ -8,7 +8,7 @@ namespace TesApi.Web.Management.Models.Quotas /// /// Result of group checking quota for pools and jobs. /// - /// The number of pools or jobs above the "required" request that exceeded the available quota. + /// The number of pools or jobs above the "required" request that exceeded the available quota. /// The to return to the tasks that could not be accomodated. - public record struct CheckGroupPoolAndJobQuotaResult(int Exceeded, Exception Exception); + public record struct CheckGroupPoolAndJobQuotaResult(int ExceededQuantity, Exception Exception); } diff --git a/src/TesApi.Web/RelatedTask.cs b/src/TesApi.Web/RelatedTask.cs index 4f3f2191a..d46792bf4 100644 --- a/src/TesApi.Web/RelatedTask.cs +++ b/src/TesApi.Web/RelatedTask.cs @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. +using System; + namespace TesApi.Web { /// @@ -36,11 +38,10 @@ public System.Runtime.CompilerServices.TaskAwaiter GetAwaiter() /// Faulted, or /// Canceled. 
/// -#pragma warning disable CA1816 // Dispose methods should call SuppressFinalize public void Dispose() -#pragma warning restore CA1816 // Dispose methods should call SuppressFinalize { Task.Dispose(); + GC.SuppressFinalize(this); } } } From 6dd9f772c6057a584f00b2dc949604f08d9c54cd Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 27 Dec 2023 11:55:53 -0800 Subject: [PATCH 122/202] formatting --- src/TesApi.Web/BatchScheduler.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 466bd8a1c..ddc056b8d 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -665,7 +665,7 @@ bool TryRemoveKeyAndTasks(string key, out (string Key, ImmutableArray l.Length)); - await Parallel.ForEachAsync(tasksPoolMetadataByPoolKey, cancellationToken, async(pool, token) => + await Parallel.ForEachAsync(tasksPoolMetadataByPoolKey, cancellationToken, async (pool, token) => { var (_, virtualMachineInfo, identities, displayName) = pool.Value.First(); From 805d7ccbfdefafd588f9de86d1db43fa8a030edb Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 26 Jan 2024 23:32:13 -0800 Subject: [PATCH 123/202] format --- src/TesApi.Web/Events/RunnerEventsProcessor.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TesApi.Web/Events/RunnerEventsProcessor.cs b/src/TesApi.Web/Events/RunnerEventsProcessor.cs index 2ef498ea3..c9018294b 100644 --- a/src/TesApi.Web/Events/RunnerEventsProcessor.cs +++ b/src/TesApi.Web/Events/RunnerEventsProcessor.cs @@ -402,7 +402,7 @@ async IAsyncEnumerable GetProcessLogsAsync(string messageName, Tes.Model var directoryUri = await storageAccessProvider.GetInternalTesTaskBlobUrlAsync(tesTask, string.Empty, Azure.Storage.Sas.BlobSasPermissions.List, cancellationToken); var namePrefixLen = new BlobUriBuilder(directoryUri).BlobName.Length + 1; - await foreach (var(uri, label) in azureProxy.ListBlobsAsync(directoryUri, 
cancellationToken) + await foreach (var (uri, label) in azureProxy.ListBlobsAsync(directoryUri, cancellationToken) .Where(blob => !blob.BlobName[namePrefixLen..].Contains('/')) // no "subdirectories" .Select(blob => (blob.BlobUri, BlobName: blob.BlobName.Split('/').Last())) // just the name .Where(blob => blob.BlobName.EndsWith(".txt") && blob.BlobName.StartsWith(blobNameStartsWith)) // name starts and ends with expected values From 2065644357a47fcd763b1f4f474f1ad3f25361f5 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 7 Mar 2024 18:52:46 -0800 Subject: [PATCH 124/202] formatting --- src/TesApi.Tests/BatchSchedulerTests.cs | 2 +- src/deploy-tes-on-azure/Deployer.cs | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index d82bf9873..c838bb2c0 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -486,7 +486,7 @@ void Validator(TesTask tesTask, IEnumerable<(LogLevel logLevel, Exception except break; default: - Assert.Fail("Unexpected TesState: {State}." 
,tesTask.State); + Assert.Fail("Unexpected TesState: {State}.", tesTask.State); break; } }); diff --git a/src/deploy-tes-on-azure/Deployer.cs b/src/deploy-tes-on-azure/Deployer.cs index 24c5aa12a..20a9a5f69 100644 --- a/src/deploy-tes-on-azure/Deployer.cs +++ b/src/deploy-tes-on-azure/Deployer.cs @@ -567,7 +567,8 @@ await Task.WhenAll( appInsights = await CreateAppInsightsResourceAsync(configuration.LogAnalyticsArmId); await AssignVmAsContributorToAppInsightsAsync(managedIdentity, appInsights); }), - Task.Run(async () => { + Task.Run(async () => + { postgreSqlFlexServer ??= await CreatePostgreSqlServerAndDatabaseAsync(postgreSqlFlexManagementClient, vnetAndSubnet.Value.postgreSqlSubnet, postgreSqlDnsZone); }) ]); From 7970ec1006b5744f392e9c3f9548afb53744abf8 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Tue, 12 Mar 2024 10:55:58 -0700 Subject: [PATCH 125/202] Feedback and minor cleanup --- src/Tes.Runner/Executor.cs | 31 ++++++++++--------- .../Repository/TesTaskPostgreSqlRepository.cs | 3 +- src/TesApi.Web/BatchPool.cs | 26 ++++------------ 3 files changed, 24 insertions(+), 36 deletions(-) diff --git a/src/Tes.Runner/Executor.cs b/src/Tes.Runner/Executor.cs index 6514c002d..c1400c5ca 100644 --- a/src/Tes.Runner/Executor.cs +++ b/src/Tes.Runner/Executor.cs @@ -11,14 +11,14 @@ namespace Tes.Runner { - public class Executor : IAsyncDisposable + public sealed class Executor : IAsyncDisposable { public const long ZeroBytesTransferred = 0; public const long DefaultErrorExitCode = 1; private readonly ILogger logger = PipelineLoggerFactory.Create(); private readonly NodeTask tesNodeTask; private readonly FileOperationResolver operationResolver; - private readonly VolumeBindingsGenerator volumeBindingsGenerator = new VolumeBindingsGenerator(); + private readonly VolumeBindingsGenerator volumeBindingsGenerator = new(); private readonly EventsPublisher eventsPublisher; private readonly ITransferOperationFactory transferOperationFactory; @@ -78,7 +78,7 @@ private 
ExecutionOptions CreateExecutionOptions(List bindings) return new ExecutionOptions(tesNodeTask.ImageName, tesNodeTask.ImageTag, tesNodeTask.CommandsToExecute, bindings, tesNodeTask.ContainerWorkDir, tesNodeTask.RuntimeOptions); } - private string ToStatusMessage(ContainerExecutionResult result) + private static string ToStatusMessage(ContainerExecutionResult result) { if (result.ExitCode == 0 && string.IsNullOrWhiteSpace(result.Error)) { @@ -148,15 +148,15 @@ public async Task UploadOutputsAsync(BlobPipelineOptions blobPipelineOptio } } - private async Task<(long BytesTransferred, IEnumerable CompletedFiles)> UploadOutputsAsync(BlobPipelineOptions blobPipelineOptions, List outputs) + private async Task UploadOutputsAsync(BlobPipelineOptions blobPipelineOptions, List outputs) { var uploader = await transferOperationFactory.CreateBlobUploaderAsync(blobPipelineOptions); var executionResult = await TimedExecutionAsync(async () => await uploader.UploadAsync(outputs)); - logger.LogInformation($"Executed Upload. Time elapsed: {executionResult.Elapsed} Bandwidth: {BlobSizeUtils.ToBandwidth(executionResult.Result, executionResult.Elapsed.TotalSeconds)} MiB/s"); + logger.LogInformation(@"Executed Upload. 
Time elapsed: {ElapsedTime} Bandwidth: {BandwidthMiBpS} MiB/s", executionResult.Elapsed, BlobSizeUtils.ToBandwidth(executionResult.Result, executionResult.Elapsed.TotalSeconds)); - return (executionResult.Result, uploader.CompletedFiles); + return new(executionResult.Result, uploader.CompletedFiles); } private async Task?> CreateUploadOutputsAsync() @@ -181,7 +181,7 @@ private BlobPipelineOptions OptimizeBlobPipelineOptionsForUpload(BlobPipelineOpt LogStartConfig(optimizedOptions); - logger.LogInformation($"{outputs.Count} outputs to upload."); + logger.LogInformation("{OutputsCount} outputs to upload.", outputs.Count); return optimizedOptions; } @@ -192,7 +192,7 @@ private BlobPipelineOptions OptimizeBlobPipelineOptionsForDownload(BlobPipelineO LogStartConfig(optimizedOptions); - logger.LogInformation($"{tesNodeTask.Inputs?.Count} inputs to download."); + logger.LogInformation("{InputsCount} inputs to download.", tesNodeTask.Inputs?.Count); return optimizedOptions; } @@ -245,7 +245,7 @@ private async Task DownloadInputsAsync(BlobPipelineOptions blobPipelineOpt var executionResult = await TimedExecutionAsync(async () => await downloader.DownloadAsync(inputs)); - logger.LogInformation($"Executed Download. Time elapsed: {executionResult.Elapsed} Bandwidth: {BlobSizeUtils.ToBandwidth(executionResult.Result, executionResult.Elapsed.TotalSeconds)} MiB/s"); + logger.LogInformation("Executed Download. 
Time elapsed: {ElapsedTime} Bandwidth: {BandwidthMiBpS} MiB/s", executionResult.Elapsed, BlobSizeUtils.ToBandwidth(executionResult.Result, executionResult.Elapsed.TotalSeconds)); return executionResult.Result; } @@ -273,10 +273,10 @@ private static void ValidateBlockSize(int blockSizeBytes) private void LogStartConfig(BlobPipelineOptions blobPipelineOptions) { - logger.LogInformation($"Writers: {blobPipelineOptions.NumberOfWriters}"); - logger.LogInformation($"Readers: {blobPipelineOptions.NumberOfReaders}"); - logger.LogInformation($"Capacity: {blobPipelineOptions.ReadWriteBuffersCapacity}"); - logger.LogInformation($"BlockSize: {blobPipelineOptions.BlockSizeBytes}"); + logger.LogInformation("Writers: {NumberOfWriters}", blobPipelineOptions.NumberOfWriters); + logger.LogInformation("Readers: {NumberOfReaders}", blobPipelineOptions.NumberOfReaders); + logger.LogInformation("Capacity: {ReadWriteBuffersCapacity}", blobPipelineOptions.ReadWriteBuffersCapacity); + logger.LogInformation("BlockSize: {BlockSizeBytes}", blobPipelineOptions.BlockSizeBytes); } private static async Task> TimedExecutionAsync(Func> execution) @@ -288,9 +288,10 @@ private static async Task> TimedExecutionAsync(Func(sw.Elapsed, result); } - private record TimedExecutionResult(TimeSpan Elapsed, T Result); + private record struct UploadResults(long BytesTransferred, IEnumerable CompletedFiles); + private record struct TimedExecutionResult(TimeSpan Elapsed, T Result); - public async ValueTask DisposeAsync() + async ValueTask IAsyncDisposable.DisposeAsync() { await eventsPublisher.FlushPublishersAsync(); } diff --git a/src/Tes/Repository/TesTaskPostgreSqlRepository.cs b/src/Tes/Repository/TesTaskPostgreSqlRepository.cs index ec6f1c2f8..4b7585e9b 100644 --- a/src/Tes/Repository/TesTaskPostgreSqlRepository.cs +++ b/src/Tes/Repository/TesTaskPostgreSqlRepository.cs @@ -26,7 +26,7 @@ public sealed class TesTaskPostgreSqlRepository : PostgreSqlCachingRepository NpgsqlDataSourceBuilder => connectionString 
=> new Npgsql.NpgsqlDataSourceBuilder(connectionString) - .EnableDynamicJson(jsonbClrTypes: new[] { typeof(TesTask) }) + .EnableDynamicJson(jsonbClrTypes: [typeof(TesTask)]) .Build(); // Configuration of NpgsqlDbContext @@ -149,6 +149,7 @@ public async Task CreateItemAsync(TesTask task, CancellationToken cance /// TesTask to store as JSON in the database /// A for controlling the lifetime of the asynchronous operation. /// + [System.Diagnostics.CodeAnalysis.SuppressMessage("Style", "IDE0305:Simplify collection initialization", Justification = "ToList() is very explicit as well as consise without being confusing.")] public async Task> CreateItemsAsync(List items, CancellationToken cancellationToken) => (await Task.WhenAll(items.Select(task => CreateItemAsync(task, cancellationToken)))).ToList(); diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index 0d7467486..37bdca357 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -288,15 +288,15 @@ public static string AutoPoolFormula(bool preemptable, int initialTarget) */ { var targetVariable = preemptable ? "TargetLowPriorityNodes" : "TargetDedicated"; - return string.Join(Environment.NewLine, new[] - { + return string.Join(Environment.NewLine, + [ "$NodeDeallocationOption = taskcompletion;", $"""lifespan = time() - time("{DateTime.UtcNow:r}");""", "span = TimeInterval_Second * 90;", "startup = TimeInterval_Minute * 2;", "ratio = 10;", $"${targetVariable} = (lifespan > startup ? 
min($PendingTasks.GetSample(span, ratio)) : {initialTarget});" - }); + ]); } private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken cancellationToken) @@ -479,20 +479,6 @@ public async ValueTask CanBeDeletedAsync(CancellationToken cancellationTok return false; } - //await foreach (var node in _azureProxy.ListComputeNodesAsync(PoolId, new ODATADetailLevel(selectClause: "state")).WithCancellation(cancellationToken)) - //{ - // switch (node.State) - // { - // case ComputeNodeState.Rebooting: - // case ComputeNodeState.Reimaging: - // case ComputeNodeState.Running: - // case ComputeNodeState.Creating: - // case ComputeNodeState.Starting: - // case ComputeNodeState.WaitingForStartTask: - // return false; - // } - //} - return true; } @@ -656,8 +642,8 @@ async ValueTask ParseBatchException(BatchException batchException, Cancell } } - private readonly List _foundTasks = new(); - private readonly Dictionary _taskPreviousComputeNodeIds = new(); + private readonly List _foundTasks = []; + private readonly Dictionary _taskPreviousComputeNodeIds = []; private Lazy>> _lazyComputeNodes; private const string EjectableComputeNodesFilterClause = @"state eq 'starttaskfailed' or state eq 'preempted' or state eq 'unusable'"; @@ -719,7 +705,7 @@ await Task.WhenAll( // In the extremely unlikely event that there are no innerexceptions, we don't want to change the existing code flow nor do we want to complicate the (less than 2 inner exceptions) path. if (exception.InnerExceptions?.Count != 1) { - throw new AggregateException(exception.Message, exception.InnerExceptions?.Select(HandleException) ?? Enumerable.Empty()); + throw new AggregateException(exception.Message, exception.InnerExceptions?.Select(HandleException) ?? 
[]); } throw HandleException(exception.InnerException); From 55a6d0c6eb0e7b6d5222af18ce07966ec38e710f Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 13 Mar 2024 21:26:53 -0700 Subject: [PATCH 126/202] fix off-by-one error --- src/TesApi.Web/Events/RunnerEventsProcessor.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TesApi.Web/Events/RunnerEventsProcessor.cs b/src/TesApi.Web/Events/RunnerEventsProcessor.cs index c9018294b..5c4045ff4 100644 --- a/src/TesApi.Web/Events/RunnerEventsProcessor.cs +++ b/src/TesApi.Web/Events/RunnerEventsProcessor.cs @@ -409,7 +409,7 @@ async IAsyncEnumerable GetProcessLogsAsync(string messageName, Tes.Model .Select(blob => (blob.BlobUri, BlobNameParts: blob.BlobName.Split('_', 4))) // split name into sections .Where(blob => blob.BlobNameParts.Length > 2 && !blob.BlobNameParts.Any(string.IsNullOrWhiteSpace)) // 3 or 4 sections and no sections are empty .OrderBy(blob => string.Join('_', blob.BlobNameParts.Take(3))) // sort by "root" names - .ThenBy(blob => blob.BlobNameParts.Length < 3 ? -1 : int.Parse(blob.BlobNameParts[3][..blob.BlobNameParts[3].IndexOf('.')], System.Globalization.CultureInfo.InvariantCulture)) // then by extended numbers + .ThenBy(blob => blob.BlobNameParts.Length < 4 ? 
-1 : int.Parse(blob.BlobNameParts[3][..blob.BlobNameParts[3].IndexOf('.')], System.Globalization.CultureInfo.InvariantCulture)) // then by extended numbers .Select(blob => (blob.BlobUri, blob.BlobNameParts[1])) // uri and which standard stream .WithCancellation(cancellationToken)) { From 5b680ee9b5623c421800acda8a9f14842c3044be Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 17 Apr 2024 11:56:41 -0700 Subject: [PATCH 127/202] update dependencies --- src/CommonUtilities.Tests/CommonUtilities.Tests.csproj | 4 ++-- src/CommonUtilities/CommonUtilities.csproj | 2 +- src/GenerateBatchVmSkus/GenerateBatchVmSkus.csproj | 2 +- src/Tes.ApiClients.Tests/Tes.ApiClients.Tests.csproj | 4 ++-- src/Tes.ApiClients/Tes.ApiClients.csproj | 2 +- src/Tes.Runner.Test/Tes.Runner.Test.csproj | 4 ++-- src/Tes.Runner/Tes.Runner.csproj | 2 +- src/Tes.SDK.Tests/Tes.SDK.Tests.csproj | 4 ++-- src/TesApi.Tests/TesApi.Tests.csproj | 4 ++-- src/TesApi.Web/TesApi.Web.csproj | 4 ++-- .../deploy-tes-on-azure.Tests.csproj | 4 ++-- src/deploy-tes-on-azure/deploy-tes-on-azure.csproj | 6 +++--- 12 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/CommonUtilities.Tests/CommonUtilities.Tests.csproj b/src/CommonUtilities.Tests/CommonUtilities.Tests.csproj index f7fa899ed..f0dff427d 100644 --- a/src/CommonUtilities.Tests/CommonUtilities.Tests.csproj +++ b/src/CommonUtilities.Tests/CommonUtilities.Tests.csproj @@ -12,8 +12,8 @@ - - + + all runtime; build; native; contentfiles; analyzers; buildtransitive diff --git a/src/CommonUtilities/CommonUtilities.csproj b/src/CommonUtilities/CommonUtilities.csproj index da8f92798..5d5b57cf2 100644 --- a/src/CommonUtilities/CommonUtilities.csproj +++ b/src/CommonUtilities/CommonUtilities.csproj @@ -7,7 +7,7 @@ - + diff --git a/src/GenerateBatchVmSkus/GenerateBatchVmSkus.csproj b/src/GenerateBatchVmSkus/GenerateBatchVmSkus.csproj index df0a49d00..1298db240 100644 --- a/src/GenerateBatchVmSkus/GenerateBatchVmSkus.csproj +++ 
b/src/GenerateBatchVmSkus/GenerateBatchVmSkus.csproj @@ -8,7 +8,7 @@ - + diff --git a/src/Tes.ApiClients.Tests/Tes.ApiClients.Tests.csproj b/src/Tes.ApiClients.Tests/Tes.ApiClients.Tests.csproj index 209d63cd9..f01e8e866 100644 --- a/src/Tes.ApiClients.Tests/Tes.ApiClients.Tests.csproj +++ b/src/Tes.ApiClients.Tests/Tes.ApiClients.Tests.csproj @@ -15,8 +15,8 @@ - - + + diff --git a/src/Tes.ApiClients/Tes.ApiClients.csproj b/src/Tes.ApiClients/Tes.ApiClients.csproj index bf86c5f11..2fcc79c13 100644 --- a/src/Tes.ApiClients/Tes.ApiClients.csproj +++ b/src/Tes.ApiClients/Tes.ApiClients.csproj @@ -8,7 +8,7 @@ - + diff --git a/src/Tes.Runner.Test/Tes.Runner.Test.csproj b/src/Tes.Runner.Test/Tes.Runner.Test.csproj index ff78d5309..249bae8bc 100644 --- a/src/Tes.Runner.Test/Tes.Runner.Test.csproj +++ b/src/Tes.Runner.Test/Tes.Runner.Test.csproj @@ -14,8 +14,8 @@ - - + + all runtime; build; native; contentfiles; analyzers; buildtransitive diff --git a/src/Tes.Runner/Tes.Runner.csproj b/src/Tes.Runner/Tes.Runner.csproj index 206b6af41..42e50d55a 100644 --- a/src/Tes.Runner/Tes.Runner.csproj +++ b/src/Tes.Runner/Tes.Runner.csproj @@ -8,7 +8,7 @@ - + diff --git a/src/Tes.SDK.Tests/Tes.SDK.Tests.csproj b/src/Tes.SDK.Tests/Tes.SDK.Tests.csproj index 1c2314841..2f79e7fd0 100644 --- a/src/Tes.SDK.Tests/Tes.SDK.Tests.csproj +++ b/src/Tes.SDK.Tests/Tes.SDK.Tests.csproj @@ -16,8 +16,8 @@ - - + + diff --git a/src/TesApi.Tests/TesApi.Tests.csproj b/src/TesApi.Tests/TesApi.Tests.csproj index 4eb2bfe0f..f61d0c05e 100644 --- a/src/TesApi.Tests/TesApi.Tests.csproj +++ b/src/TesApi.Tests/TesApi.Tests.csproj @@ -19,8 +19,8 @@ - - + + diff --git a/src/TesApi.Web/TesApi.Web.csproj b/src/TesApi.Web/TesApi.Web.csproj index a887eed73..b17b4e3d2 100644 --- a/src/TesApi.Web/TesApi.Web.csproj +++ b/src/TesApi.Web/TesApi.Web.csproj @@ -12,12 +12,12 @@ - + - + diff --git a/src/deploy-tes-on-azure.Tests/deploy-tes-on-azure.Tests.csproj b/src/deploy-tes-on-azure.Tests/deploy-tes-on-azure.Tests.csproj 
index f1fe5bca3..28245e132 100644 --- a/src/deploy-tes-on-azure.Tests/deploy-tes-on-azure.Tests.csproj +++ b/src/deploy-tes-on-azure.Tests/deploy-tes-on-azure.Tests.csproj @@ -9,8 +9,8 @@ - - + + diff --git a/src/deploy-tes-on-azure/deploy-tes-on-azure.csproj b/src/deploy-tes-on-azure/deploy-tes-on-azure.csproj index 42547bebd..f73690f01 100644 --- a/src/deploy-tes-on-azure/deploy-tes-on-azure.csproj +++ b/src/deploy-tes-on-azure/deploy-tes-on-azure.csproj @@ -18,9 +18,9 @@ - + - + @@ -43,7 +43,7 @@ - + From 8e15f6d0acfd7e5d97dad2bc67a3fa272a0516c4 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 25 Apr 2024 13:21:19 -0700 Subject: [PATCH 128/202] Weird test failure --- src/TesApi.Tests/BatchSchedulerTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index e2b8d7950..9096b72da 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -491,7 +491,7 @@ void Validator(TesTask tesTask, IEnumerable<(LogLevel logLevel, Exception except break; default: - Assert.Fail("Unexpected TesState: {State}.", tesTask.State); + Assert.Fail($"Unexpected TesState: {tesTask.State}."); break; } }); From b969fe3eaaf3c7e5e82c290d274440743f81aff5 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 26 Apr 2024 10:56:25 -0700 Subject: [PATCH 129/202] Add task start event to the processed events --- src/Tes.Runner/Events/EventMessage.cs | 2 ++ src/TesApi.Web/AzureBatchTaskState.cs | 6 ++-- src/TesApi.Web/BatchScheduler.cs | 11 +------ .../Events/RunnerEventsProcessor.cs | 32 +++++++++++++------ 4 files changed, 27 insertions(+), 24 deletions(-) diff --git a/src/Tes.Runner/Events/EventMessage.cs b/src/Tes.Runner/Events/EventMessage.cs index 565d2f0fc..9ef9ce59a 100644 --- a/src/Tes.Runner/Events/EventMessage.cs +++ b/src/Tes.Runner/Events/EventMessage.cs @@ -27,8 +27,10 @@ public sealed class EventMessage [JsonPropertyName("resources")] 
public List? Resources { get; set; } + [JsonPropertyName("created")] public DateTime Created { get; set; } + [JsonPropertyName("eventVersion")] public string EventVersion { get; set; } = null!; diff --git a/src/TesApi.Web/AzureBatchTaskState.cs b/src/TesApi.Web/AzureBatchTaskState.cs index 7acd64a67..8138d41f6 100644 --- a/src/TesApi.Web/AzureBatchTaskState.cs +++ b/src/TesApi.Web/AzureBatchTaskState.cs @@ -23,7 +23,6 @@ namespace TesApi.Web /// . /// . /// Warning. First item in enumeration is the Warning code, rest of items are additional system log entries. - /// Replace previous logged value of the task start time with . public record class AzureBatchTaskState( TaskState State, IEnumerable OutputFileLogs = default, @@ -35,8 +34,7 @@ public record class AzureBatchTaskState( int? ExecutorExitCode = default, DateTimeOffset? BatchTaskEndTime = default, int? BatchTaskExitCode = default, - IEnumerable Warning = default, - bool ReplaceBatchTaskStartTime = default) + IEnumerable Warning = default) { /// /// Unknown error. Either an exception or a missing reason. 
@@ -105,7 +103,7 @@ public enum TaskState NodeAllocationFailed, /// - /// Azure Batch pre-empted the execution of this task while running on a low-priority node + /// Azure Batch preempted the execution of this task while running on a low-priority node /// NodePreempted, diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 026527bd2..271c3ac13 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -17,7 +17,6 @@ using Microsoft.Extensions.Options; using Microsoft.Extensions.Primitives; using Tes.Extensions; -using Tes.Models; using TesApi.Web.Events; using TesApi.Web.Extensions; using TesApi.Web.Management; @@ -208,6 +207,7 @@ async Task SetTaskStateAndLogAsync(TesTask tesTask, TesState newTaskState, tesTaskLog.BatchNodeMetrics ??= batchNodeMetrics; tesTaskLog.CromwellResultCode ??= cromwellRcCode; tesTaskLog.EndTime ??= batchInfo.BatchTaskEndTime ?? taskEndTime; + tesTaskLog.StartTime ??= batchInfo.BatchTaskStartTime ?? taskStartTime; if (batchInfo.ExecutorEndTime is not null || batchInfo.ExecutorStartTime is not null || batchInfo.ExecutorExitCode is not null) { @@ -217,15 +217,6 @@ async Task SetTaskStateAndLogAsync(TesTask tesTask, TesState newTaskState, tesTaskExecutorLog.ExitCode ??= batchInfo.ExecutorExitCode; } - if (batchInfo.ReplaceBatchTaskStartTime) - { - tesTaskLog.StartTime = batchInfo.BatchTaskStartTime ?? taskStartTime; - } - else - { - tesTaskLog.StartTime ??= batchInfo.BatchTaskStartTime ?? 
taskStartTime; - } - if (batchInfo.OutputFileLogs is not null) { tesTaskLog.Outputs ??= []; diff --git a/src/TesApi.Web/Events/RunnerEventsProcessor.cs b/src/TesApi.Web/Events/RunnerEventsProcessor.cs index c61b533aa..796e3e6b8 100644 --- a/src/TesApi.Web/Events/RunnerEventsProcessor.cs +++ b/src/TesApi.Web/Events/RunnerEventsProcessor.cs @@ -135,6 +135,12 @@ public async Task DownloadAndValidateMessageContentAsync(Ru // Event type specific content validations switch (content.Name) { + case Tes.Runner.Events.EventsPublisher.TaskCommencementEvent: + Validate(Tes.Runner.Events.EventsPublisher.StartedStatus.Equals(content.StatusMessage, StringComparison.Ordinal), + $"{nameof(content.StatusMessage)}('{content.StatusMessage}') does not match the expected value of '{Tes.Runner.Events.EventsPublisher.StartedStatus}'."); + ValidateCreated(); + break; + case Tes.Runner.Events.EventsPublisher.DownloadStartEvent: Validate(Tes.Runner.Events.EventsPublisher.StartedStatus.Equals(content.StatusMessage, StringComparison.Ordinal), $"{nameof(content.StatusMessage)}('{content.StatusMessage}') does not match the expected value of '{Tes.Runner.Events.EventsPublisher.StartedStatus}'."); @@ -209,12 +215,14 @@ static void Validate([System.Diagnostics.CodeAnalysis.DoesNotReturnIf(false)] bo private static readonly IReadOnlyDictionary EventsInOrder = new Dictionary(StringComparer.OrdinalIgnoreCase) { - { Tes.Runner.Events.EventsPublisher.DownloadStartEvent, int.MinValue }, - { Tes.Runner.Events.EventsPublisher.DownloadEndEvent, int.MinValue + 1 }, + { Tes.Runner.Events.EventsPublisher.TaskCommencementEvent, int.MinValue }, + { Tes.Runner.Events.EventsPublisher.DownloadStartEvent, int.MinValue + 1 }, + { Tes.Runner.Events.EventsPublisher.DownloadEndEvent, int.MinValue + 2 }, { Tes.Runner.Events.EventsPublisher.ExecutorStartEvent, -1 }, { Tes.Runner.Events.EventsPublisher.ExecutorEndEvent, +1 }, - { Tes.Runner.Events.EventsPublisher.UploadStartEvent, int.MaxValue - 1 }, - { 
Tes.Runner.Events.EventsPublisher.UploadEndEvent, int.MaxValue }, + { Tes.Runner.Events.EventsPublisher.UploadStartEvent, int.MaxValue - 2 }, + { Tes.Runner.Events.EventsPublisher.UploadEndEvent, int.MaxValue - 1 }, + { Tes.Runner.Events.EventsPublisher.TaskCompletionEvent, int.MaxValue }, }.AsReadOnly(); /// @@ -230,7 +238,13 @@ public IEnumerable OrderProcessedByExecutorSequence(IEnumerable source, ArgumentNullException.ThrowIfNull(source); ArgumentNullException.ThrowIfNull(messageGetter); - return source.OrderBy(t => OrderBy(messageGetter(t))).ThenBy(t => ThenBy(messageGetter(t))); + return source.OrderBy(OrderByT).ThenBy(ThenByT); + + DateTime OrderByT(T item) + => OrderBy(messageGetter(item)); + + int ThenByT(T item) + => ThenBy(messageGetter(item)); static DateTime OrderBy(RunnerEventsMessage message) => (message.RunnerEventMessage?.Created ?? DateTime.Parse(message.Tags["created"])).ToUniversalTime(); @@ -261,9 +275,11 @@ public async Task GetMessageBatchStateAsync(RunnerEventsMes return (nodeMessage.Name ?? 
message.Event) switch { - Tes.Runner.Events.EventsPublisher.DownloadStartEvent => new AzureBatchTaskState(AzureBatchTaskState.TaskState.Initializing, + Tes.Runner.Events.EventsPublisher.TaskCommencementEvent => new AzureBatchTaskState(AzureBatchTaskState.TaskState.Initializing, BatchTaskStartTime: nodeMessage.Created), + Tes.Runner.Events.EventsPublisher.DownloadStartEvent => new AzureBatchTaskState(AzureBatchTaskState.TaskState.NoChange), + Tes.Runner.Events.EventsPublisher.DownloadEndEvent => nodeMessage.StatusMessage switch { Tes.Runner.Events.EventsPublisher.SuccessStatus => new(AzureBatchTaskState.TaskState.NoChange), @@ -325,8 +341,6 @@ public async Task GetMessageBatchStateAsync(RunnerEventsMes { Tes.Runner.Events.EventsPublisher.SuccessStatus => new( AzureBatchTaskState.TaskState.CompletedSuccessfully, - ReplaceBatchTaskStartTime: true, - BatchTaskStartTime: nodeMessage.Created - TimeSpan.Parse(nodeMessage.EventData["duration"]), BatchTaskEndTime: nodeMessage.Created), Tes.Runner.Events.EventsPublisher.FailedStatus => new( @@ -335,8 +349,6 @@ public async Task GetMessageBatchStateAsync(RunnerEventsMes Enumerable.Empty() .Append("Node script failed.") .Append(nodeMessage.EventData["errorMessage"])), - ReplaceBatchTaskStartTime: true, - BatchTaskStartTime: nodeMessage.Created - TimeSpan.Parse(nodeMessage.EventData["duration"]), BatchTaskEndTime: nodeMessage.Created), _ => throw new System.Diagnostics.UnreachableException(), From e8d692c959d29b1752ea4979e1a2a7c0b26e222a Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 26 Apr 2024 11:12:05 -0700 Subject: [PATCH 130/202] Fix System.ArgumentOutOfRangeException: Specified argument was out of the range of valid values. 
(Parameter 'value') --- src/TesApi.Web/AzureProxy.cs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index 840a6f279..458323f89 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -15,9 +15,7 @@ using Microsoft.Azure.Batch; using Microsoft.Azure.Batch.Auth; using Microsoft.Azure.Batch.Common; -using Microsoft.Azure.Management.ResourceManager.Fluent; using Microsoft.Azure.Management.ResourceManager.Fluent.Authentication; -using Microsoft.Azure.Services.AppAuthentication; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using Microsoft.Rest; @@ -29,9 +27,7 @@ using TesApi.Web.Storage; using static CommonUtilities.RetryHandler; using BatchModels = Microsoft.Azure.Management.Batch.Models; -using CloudTask = Microsoft.Azure.Batch.CloudTask; using FluentAzure = Microsoft.Azure.Management.Fluent.Azure; -using OnAllTasksComplete = Microsoft.Azure.Batch.Common.OnAllTasksComplete; namespace TesApi.Web { @@ -182,7 +178,7 @@ public async Task AddBatchTasksAsync(IEnumerable cloudTasks, string j batchClient.JobOperations.GetJobAsync(jobId, cancellationToken: ct), cancellationToken); - await job.AddTaskAsync(cloudTasks, new() { CancellationToken = cancellationToken, MaxDegreeOfParallelism = (int)Math.Ceiling((double)cloudTasks.Count() / Microsoft.Azure.Batch.Constants.MaxTasksInSingleAddTaskCollectionRequest) }); + await job.AddTaskAsync(cloudTasks, new() { CancellationToken = cancellationToken, MaxDegreeOfParallelism = (int)Math.Ceiling((double)cloudTasks.Count() / Constants.MaxTasksInSingleAddTaskCollectionRequest) + 1 }); } /// From 6a355f6ee400c8b210196fb7df0c4f3ff5641a5c Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 15 May 2024 11:49:16 -0700 Subject: [PATCH 131/202] cleanup --- src/TesApi.Web/BatchPoolService.cs | 4 +--- src/TesApi.Web/BatchPools.BatchScheduler.cs | 4 ++-- src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs | 2 
+- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/TesApi.Web/BatchPoolService.cs b/src/TesApi.Web/BatchPoolService.cs index 0d5e94735..d8702315a 100644 --- a/src/TesApi.Web/BatchPoolService.cs +++ b/src/TesApi.Web/BatchPoolService.cs @@ -7,7 +7,6 @@ using System.Threading.Tasks; using Microsoft.Extensions.Hosting; using Microsoft.Extensions.Logging; -using Microsoft.Extensions.Options; namespace TesApi.Web { @@ -29,11 +28,10 @@ public class BatchPoolService : BackgroundService /// /// Default constructor /// - /// Configuration of /// /// /// - public BatchPoolService(IOptions batchSchedulingOptions, IBatchScheduler batchScheduler, ILogger logger) + public BatchPoolService(IBatchScheduler batchScheduler, ILogger logger) { _batchScheduler = batchScheduler ?? throw new ArgumentNullException(nameof(batchScheduler)); _logger = logger ?? throw new ArgumentNullException(nameof(logger)); diff --git a/src/TesApi.Web/BatchPools.BatchScheduler.cs b/src/TesApi.Web/BatchPools.BatchScheduler.cs index 2dfcdad84..f8d680de7 100644 --- a/src/TesApi.Web/BatchPools.BatchScheduler.cs +++ b/src/TesApi.Web/BatchPools.BatchScheduler.cs @@ -90,8 +90,8 @@ var x when char.IsAsciiLetterOrDigit(x) => x, } } - private readonly BatchPools batchPools = new(); - private readonly HashSet neededPools = new(); + private readonly BatchPools batchPools = []; + private readonly HashSet neededPools = []; /// public bool NeedPoolFlush diff --git a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs index 2db20b581..51aded3c8 100644 --- a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs +++ b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs @@ -188,7 +188,7 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func< logs.Add($"BatchErrorCode: {requestInfo.BatchError.Code}"); logs.Add($"BatchErrorMessage ({requestInfo.BatchError.Message.Language}): 
{requestInfo.BatchError.Message.Value}"); - foreach (var detail in requestInfo.BatchError.Values?.Select(d => $"BatchErrorDetail: '{d.Key}': '{d.Value}'") ?? Enumerable.Empty()) + foreach (var detail in requestInfo.BatchError.Values?.Select(d => $"BatchErrorDetail: '{d.Key}': '{d.Value}'") ?? []) { logs.Add(detail); } From ff154858875c8bb73f290ac60114d3e9cbb1d8b7 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 16 May 2024 15:16:40 -0700 Subject: [PATCH 132/202] Reduce load on storage account --- src/TesApi.Web/TaskScheduler.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TesApi.Web/TaskScheduler.cs b/src/TesApi.Web/TaskScheduler.cs index a6488cf24..3f5fc0514 100644 --- a/src/TesApi.Web/TaskScheduler.cs +++ b/src/TesApi.Web/TaskScheduler.cs @@ -27,8 +27,8 @@ namespace TesApi.Web internal class TaskScheduler(RunnerEventsProcessor nodeEventProcessor, Microsoft.Extensions.Hosting.IHostApplicationLifetime hostApplicationLifetime, IRepository repository, IBatchScheduler batchScheduler, ILogger taskSchedulerLogger) : OrchestrateOnBatchSchedulerServiceBase(hostApplicationLifetime, repository, batchScheduler, taskSchedulerLogger) { - private readonly TimeSpan blobRunInterval = TimeSpan.FromSeconds(5); - private readonly TimeSpan batchRunInterval = TimeSpan.FromSeconds(30); // The very fastest process inside of Azure Batch accessing anything within pools or jobs uses a 30 second polling interval + private readonly TimeSpan blobRunInterval = TimeSpan.FromSeconds(15); + private readonly TimeSpan batchRunInterval = TimeSpan.FromSeconds(30); // The very fastest process inside of Azure Batch accessing anything within pools or jobs appears to use a 30 second polling interval private readonly RunnerEventsProcessor nodeEventProcessor = nodeEventProcessor; /// From 01faf622dc0514f34162e08fafa3c5294ecaf97d Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 17 May 2024 11:15:26 -0700 Subject: [PATCH 133/202] Give enough time for TES to accept 
connections --- src/deploy-tes-on-azure/Deployer.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/deploy-tes-on-azure/Deployer.cs b/src/deploy-tes-on-azure/Deployer.cs index 20d3f506f..64179c500 100644 --- a/src/deploy-tes-on-azure/Deployer.cs +++ b/src/deploy-tes-on-azure/Deployer.cs @@ -651,7 +651,7 @@ await Execute( kubernetesManager.ExecKubectlProcessAsync($"port-forward -n {configuration.AksCoANamespace} svc/tes 8088:80", token, appendKubeconfig: true)); var portForwardTask = startPortForward(tokenSource.Token); - await Task.Delay(longRetryWaitTime * 2, tokenSource.Token); // Give enough time for kubectl to standup the port forwarding. + await Task.Delay(TimeSpan.FromMinutes(3.5), tokenSource.Token); // Give enough time for kubectl to standup the port forwarding, as well as enough time for TES to accept connections. var runTestTask = RunTestTaskAsync("localhost:8088", batchAccount.LowPriorityCoreQuota > 0); for (var task = await Task.WhenAny(portForwardTask, runTestTask); From ec76b3c9c85293609daf6b4df8c92f2c5cb86ca9 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 30 May 2024 12:49:45 -0700 Subject: [PATCH 134/202] Address terminal condition where cancelled task was completed --- .../OrchestrateOnBatchSchedulerServiceBase.cs | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs index 51aded3c8..38fefb575 100644 --- a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs +++ b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs @@ -165,7 +165,8 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func< } catch (Exception exc) { - if (++tesTask.ErrorCount > 3) // TODO: Should we increment this for exceptions here (current behaviour) or the attempted executions on the batch? 
+ if (++tesTask.ErrorCount > 3 || // TODO: Should we increment this for exceptions here (current behavior) or the attempted executions on the batch? + IsExceptionHttpConflictWhereTaskIsComplete(exc)) { tesTask.State = TesState.SYSTEM_ERROR; tesTask.EndTime = DateTimeOffset.UtcNow; @@ -275,6 +276,17 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func< } Logger.LogDebug("OrchestrateTesTasksOnBatch({Poll}) for {TaskCount} {UnitsLabel} completed in {TotalSeconds:c}.", pollName, tesTasks.Where(task => task is not null).Count(), unitsLabel, DateTime.UtcNow.Subtract(startTime)); + + static bool IsExceptionHttpConflictWhereTaskIsComplete(Exception exc) + { + if (exc is Microsoft.Azure.Batch.Common.BatchException batchException) + { + return System.Net.HttpStatusCode.Conflict.Equals(batchException.RequestInformation?.HttpStatusCode) && + Microsoft.Azure.Batch.Common.BatchErrorCodeStrings.TaskCompleted.Equals(batchException.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase); + } + + return false; + } } } } From 9feb6f176b05fe6d02c7b5af8fc6e1903149d51d Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 24 Jul 2024 17:54:15 -0700 Subject: [PATCH 135/202] Fix unit tests --- src/TesApi.Tests/BatchSchedulerTests.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index d94b604c7..924f9ec70 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -742,7 +742,7 @@ public async Task BatchJobContainsExpectedBatchPoolInformation() GuardAssertsWithTesTask(tesTask, () => { - Assert.AreEqual("TES-hostname-edicated1-rpsd645merzfkqmdnj7pkqrase2ancnh-", tesTask.PoolId[0..^8]); + Assert.AreEqual("TES-hostname-edicated1-obkfufnroslrzwlitqbrmjeowu7iuhfm-", tesTask.PoolId[0..^8]); Assert.AreEqual("VmSizeDedicated1", pool.VmSize); 
Assert.IsTrue(((BatchScheduler)batchScheduler).TryGetPool(tesTask.PoolId, out _)); }); @@ -1025,7 +1025,7 @@ public async Task TaskGetsCancelled() GuardAssertsWithTesTask(tesTask, () => { - Assert.AreEqual(TesState.CANCELING, tesTask.State); + Assert.AreEqual(TesState.CANCELED, tesTask.State); azureProxy.Verify(i => i.TerminateBatchTaskAsync(tesTask.Id, It.IsAny(), It.IsAny())); }); } @@ -1357,7 +1357,7 @@ public async Task PoolIsCreatedWithoutPublicIpWhenSubnetAndDisableBatchNodesPubl await foreach (var _ in batchScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, Enumerable.Repeat(azureProxyReturnValues.BatchTaskState, tesTasks.Length).ToArray(), CancellationToken.None)) { } } - var createBatchPoolAsyncInvocation = serviceProvider.AzureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IBatchPoolManager.CreateBatchPoolAsync)); + var createBatchPoolAsyncInvocation = serviceProvider.BatchPoolManager.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IBatchPoolManager.CreateBatchPoolAsync)); var addBatchTaskAsyncInvocation = serviceProvider.AzureProxy.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IAzureProxy.AddBatchTasksAsync)); var jobId = (addBatchTaskAsyncInvocation?.Arguments[1]) as string; From 5ff496139f2c8c5c3e712d2647c414fdb27dbff3 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 25 Jul 2024 13:33:42 -0700 Subject: [PATCH 136/202] Fix issues in MapLocalPathToSasUrlAsync --- .../Storage/DefaultStorageAccessProvider.cs | 28 +++++++++---------- .../Storage/TerraStorageAccessProvider.cs | 2 +- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs b/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs index e94f228a0..4fbb00a1d 100644 --- a/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs +++ b/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs @@ -100,10 +100,10 @@ public override async Task MapLocalPathToSasUrlAsync(string path, BlobSasPe path = 
$"/{storageOptions.DefaultAccountName}{path}"; } - //TODO: refactor this to throw an exception instead of logging and error and returning null. if (!StorageAccountUrlSegments.TryCreate(path, out var pathSegments)) { - Logger.LogError("Could not parse path '{UnparsablePath}'.", path); + //TODO: refactor this to throw an exception instead of logging an error and returning null. + Logger.LogError("URL does not appear to point to known azure storage."); // Not printing {path} because of "Log entries created from user input" CodeQL report. return null; } @@ -111,19 +111,19 @@ public override async Task MapLocalPathToSasUrlAsync(string path, BlobSasPe { return new StorageAccountUrlSegments(externalStorageAccountInfo.BlobEndpoint, pathSegments.ContainerName, pathSegments.BlobName, externalStorageAccountInfo.SasToken).ToUri(); } - else + + try { - try - { - var result = pathSegments.IsContainer - ? await AddSasTokenAsync(pathSegments, sasTokenDuration, ConvertSasPermissions(sasPermissions, nameof(sasPermissions)), path: path, cancellationToken: cancellationToken) - : await AddSasTokenAsync(pathSegments, sasTokenDuration, sasPermissions, path: path, cancellationToken: cancellationToken); - return result.ToUri(); - } - catch - { - return null; - } + var result = pathSegments.IsContainer + ? await AddSasTokenAsync(pathSegments, sasTokenDuration, ConvertSasPermissions(sasPermissions, nameof(sasPermissions)), path: path, cancellationToken: cancellationToken) + : await AddSasTokenAsync(pathSegments, sasTokenDuration, sasPermissions, path: path, cancellationToken: cancellationToken); + return result.ToUri(); + } + catch (Exception ex) + { + //TODO: refactor this to throw an exception instead of logging an error and returning null. 
+ Logger.LogError(ex, "SAS token could not be obtained: {FailureMessage}.", ex.Message); + return null; } } diff --git a/src/TesApi.Web/Storage/TerraStorageAccessProvider.cs b/src/TesApi.Web/Storage/TerraStorageAccessProvider.cs index 7c83a02b1..a03bcb1de 100644 --- a/src/TesApi.Web/Storage/TerraStorageAccessProvider.cs +++ b/src/TesApi.Web/Storage/TerraStorageAccessProvider.cs @@ -106,7 +106,7 @@ public override async Task MapLocalPathToSasUrlAsync(string path, BlobSasPe var terraBlobInfo = await GetTerraBlobInfoFromContainerNameAsync(path, cancellationToken); - if (sasPermissions.HasFlag(BlobSasPermissions.List)) + if (sasPermissions.HasFlag(BlobSasPermissions.List) || string.IsNullOrWhiteSpace(terraBlobInfo.BlobName)) { return await GetMappedSasContainerUrlFromWsmAsync(terraBlobInfo, false, cancellationToken); } From bf935cadf2f9ecf188e59deecf0014f8baead2d2 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 16 Aug 2024 12:27:24 -0700 Subject: [PATCH 137/202] Rename EmumerableExtensions and make queued task bulk processing more granular --- src/TesApi.Tests/BatchSchedulerTests.cs | 50 +- src/TesApi.Web/BatchScheduler.cs | 651 ++++++++++-------- ...eExtensions.cs => EnumerableExtensions.cs} | 2 +- src/TesApi.Web/IBatchScheduler.cs | 17 +- .../OrchestrateOnBatchSchedulerServiceBase.cs | 227 +++--- src/TesApi.Web/PoolScheduler.cs | 2 +- src/TesApi.Web/TaskScheduler.cs | 36 +- 7 files changed, 570 insertions(+), 415 deletions(-) rename src/TesApi.Web/Extensions/{EmumerableExtensions.cs => EnumerableExtensions.cs} (98%) diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index 924f9ec70..b7be10697 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -735,14 +735,17 @@ public async Task BatchJobContainsExpectedBatchPoolInformation() GetMockAllowedVms(config)); var batchScheduler = serviceProvider.GetT(); - await foreach (var _ in 
batchScheduler.ProcessQueuedTesTasksAsync([tesTask], CancellationToken.None)) { } + { + await using var schedulerBackgroundTasks = new PerformBatchSchedulerBackgroundTasks(batchScheduler); + _ = await batchScheduler.ProcessQueuedTesTaskAsync(tesTask, CancellationToken.None); + } var createBatchPoolAsyncInvocation = serviceProvider.BatchPoolManager.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IBatchPoolManager.CreateBatchPoolAsync)); var pool = createBatchPoolAsyncInvocation?.Arguments[0] as BatchAccountPoolData; GuardAssertsWithTesTask(tesTask, () => { - Assert.AreEqual("TES-hostname-edicated1-obkfufnroslrzwlitqbrmjeowu7iuhfm-", tesTask.PoolId[0..^8]); + Assert.AreEqual("TES-hostname-edicated1-vibnmgoytavzom4xpzeq56u7xkqevpks-", tesTask.PoolId[0..^8]); Assert.AreEqual("VmSizeDedicated1", pool.VmSize); Assert.IsTrue(((BatchScheduler)batchScheduler).TryGetPool(tesTask.PoolId, out _)); }); @@ -1350,7 +1353,8 @@ public async Task PoolIsCreatedWithoutPublicIpWhenSubnetAndDisableBatchNodesPubl if (azureProxyReturnValues.BatchTaskState is null) { - await foreach (var _ in batchScheduler.ProcessQueuedTesTasksAsync(tesTasks, CancellationToken.None)) { } + await using PerformBatchSchedulerBackgroundTasks schedulerBackgroundTasks = new(batchScheduler); + await Parallel.ForEachAsync(tesTasks, async (task, token) => _ = await batchScheduler.ProcessQueuedTesTaskAsync(task, token)); } else { @@ -1621,6 +1625,46 @@ internal static async ValueTask GuardAssertsWithTesTask(TesTask tesTask, Func Identities, string PoolDisplayName); - private record struct QueuedTaskJobMetadata(string PoolKey, string JobId, VirtualMachineInformation VirtualMachineInfo, IEnumerable Tasks); - private record struct QueuedTaskMetadata(string PoolKey, IEnumerable Tasks); + // Collections and records managing the processing of TesTasks in Queued status + private record struct PendingCloudTask(CloudTask CloudTask, TaskCompletionSource TaskCompletion); + private record struct 
PendingPoolRequest(string PoolKey, VirtualMachineInformation VirtualMachineInfo, IList Identities, string PoolDisplayName, TaskCompletionSource TaskCompletion); + private record struct PendingPool(string PoolKey, VirtualMachineInformation VirtualMachineInfo, IList Identities, string PoolDisplayName, int InitialTarget, IEnumerable> TaskCompletions); + private record struct ImmutableQueueWithTimer(Timer Timer, ImmutableQueue Queue); + + private readonly ConcurrentDictionary> _queuedTesTaskPendingTasksByJob = new(); + private readonly ConcurrentDictionary> _queuedTesTaskPendingPoolsByKey = new(); + private readonly ConcurrentQueue<(string JobId, IList Tasks)> _queuedTesTaskPendingJobBatches = new(); + private readonly ConcurrentQueue _queuedTesTaskPendingPoolQuotas = new(); + private readonly ConcurrentQueue _queuedTesTaskPendingPools = new(); /// - public async IAsyncEnumerable> ProcessQueuedTesTasksAsync(TesTask[] tesTasks, [System.Runtime.CompilerServices.EnumeratorCancellation] CancellationToken cancellationToken) + public async Task ProcessQueuedTesTaskAsync(TesTask tesTask, CancellationToken cancellationToken) { - ConcurrentBag> results = []; // Early item return facilitator. TaskCatchException() & TaskCatchAggregateException() add items to this. - ConcurrentDictionary> tasksPoolMetadataByPoolKey = new(); - - var acrPullIdentity = await actionIdentityProvider.GetAcrPullActionIdentity(CancellationToken.None); + string poolKey = default; + try { - logger.LogDebug(@"Checking quota for {QueuedTasks} tasks.", tesTasks.Length); + var identities = new List(); - // Determine how many nodes in each pool we might need for this group. 
- await Parallel.ForEachAsync(tesTasks, cancellationToken, async (tesTask, token) => + if (!string.IsNullOrWhiteSpace(globalManagedIdentity)) { - string poolKey = default; - var identities = new List(); + identities.Add(globalManagedIdentity); + } - if (!string.IsNullOrWhiteSpace(globalManagedIdentity)) + if (tesTask.Resources?.ContainsBackendParameterValue(TesResources.SupportedBackendParameters.workflow_execution_identity) == true) + { + var workflowId = tesTask.Resources?.GetBackendParameterValue(TesResources.SupportedBackendParameters.workflow_execution_identity); + + if (!NodeTaskBuilder.IsValidManagedIdentityResourceId(workflowId)) { - identities.Add(globalManagedIdentity); + workflowId = azureProxy.GetManagedIdentityInBatchAccountResourceGroup(workflowId); } - if (tesTask.Resources?.ContainsBackendParameterValue(TesResources.SupportedBackendParameters.workflow_execution_identity) == true) - { - var workflowId = tesTask.Resources?.GetBackendParameterValue(TesResources.SupportedBackendParameters.workflow_execution_identity); + identities.Add(workflowId); + } - if (!NodeTaskBuilder.IsValidManagedIdentityResourceId(workflowId)) - { - workflowId = azureProxy.GetManagedIdentityInBatchAccountResourceGroup(workflowId); - } + // acrPullIdentity is special. Add it to the end of the list even if it is null, so it is always retrievable. 
+ identities.Add(await actionIdentityProvider.GetAcrPullActionIdentity(cancellationToken)); - identities.Add(workflowId); - } + logger.LogDebug(@"Checking quota for {TesTask}.", tesTask.Id); - if (acrPullIdentity is not null) - { - identities.Add(acrPullIdentity); - } + var virtualMachineInfo = await GetVmSizeAsync(tesTask, cancellationToken); + (poolKey, var displayName) = GetPoolKey(tesTask, virtualMachineInfo, identities); + await quotaVerifier.CheckBatchAccountQuotasAsync(virtualMachineInfo, needPoolOrJobQuotaCheck: !IsPoolAvailable(poolKey), cancellationToken: cancellationToken); - try - { - var virtualMachineInfo = await GetVmSizeAsync(tesTask, token); - (poolKey, var displayName) = GetPoolKey(tesTask, virtualMachineInfo, identities); - await quotaVerifier.CheckBatchAccountQuotasAsync(virtualMachineInfo, needPoolOrJobQuotaCheck: !IsPoolAvailable(poolKey), cancellationToken: token); + // double await because the method call returns a System.Task. When that Task returns, the TesTask has been queued to a job and a pool exists to run that job's tasks + await await AttachQueuedTesTaskToBatchPoolAsync(poolKey, tesTask, virtualMachineInfo, identities, displayName, cancellationToken); - _ = tasksPoolMetadataByPoolKey.AddOrUpdate(poolKey, - _ => [new(tesTask, virtualMachineInfo, identities, displayName)], - (_, list) => list.Add(new(tesTask, virtualMachineInfo, identities, displayName))); - } - catch (AggregateException aggregateException) + var tesTaskLog = tesTask.GetOrAddTesTaskLog(); + tesTaskLog.StartTime = DateTimeOffset.UtcNow; + tesTask.State = TesState.INITIALIZING; + return true; + } + catch (AggregateException aggregateException) + { + var result = false; + var exceptions = new List(); + + foreach (var partResult in aggregateException.InnerExceptions + .Select(ex => QueuedTesTaskHandleExceptionAsync(ex, poolKey, tesTask))) + { + if (partResult.IsFaulted) { - TaskCatchAggregateException(aggregateException.Flatten().InnerExceptions, tesTask, poolKey); + 
exceptions.Add(partResult.Exception); } - catch (Exception exception) + else { - TaskCatchException(exception, tesTask, poolKey); + result |= partResult.Result; } - }); - } + } - // Return any results that are ready - foreach (var result in results) - { - yield return result; + if (exceptions.Count == 0) + { + return result; + } + else + { + throw new AggregateException(exceptions); + } } - - if (tasksPoolMetadataByPoolKey.IsEmpty) + catch (Exception exception) { - yield break; - } + var result = QueuedTesTaskHandleExceptionAsync(exception, poolKey, tesTask); - results.Clear(); + if (result.IsFaulted) + { + throw result.Exception; + } + else + { + return result.Result; + } + } + } - // Determine how many nodes in each possibly new pool we might need for this group of tasks. - var neededPoolNodesByPoolKey = tasksPoolMetadataByPoolKey.ToDictionary(t => t.Key, t => t.Value.Length); - ConcurrentBag tasksJobMetadata = []; + private async ValueTask AttachQueuedTesTaskToBatchPoolAsync(string poolKey, TesTask tesTask, VirtualMachineInformation virtualMachineInfo, IList identities, string poolDisplayName, CancellationToken cancellationToken) + { + TaskCompletionSource taskCompletion = new(); // This provides the System.Task this method returns + try { - // Determine how many new pools/jobs we need now - var requiredNewPools = neededPoolNodesByPoolKey.Keys.WhereNot(IsPoolAvailable).ToArray(); + var pool = batchPools.TryGetValue(poolKey, out var set) ? set.LastOrDefault(p => p.IsAvailable) : default; - // Revisit pool/job quotas (the above loop already dealt with the possiblility of needing just one more pool or job). - // This will remove pool keys we cannot accomodate due to quota, along with all of their associated tasks, from being queued into Batch. 
- if (requiredNewPools.Skip(1).Any()) + if (pool is null) { - bool TryRemoveKeyAndTasks(string key, out (string Key, ImmutableArray ListOfTaskMetadata) result) + TaskCompletionSource poolCompletion = new(); // This provides the poolId of the pool provided for the task + AddTValueToCollectorQueue( + key: poolKey, + value: new PendingPoolRequest(poolKey, virtualMachineInfo, identities, poolDisplayName, poolCompletion), + dictionary: _queuedTesTaskPendingPoolsByKey, + enqueue: (key, tasks) => _queuedTesTaskPendingPoolQuotas.Enqueue(new(key, tasks.First().VirtualMachineInfo, tasks.First().Identities, tasks.First().PoolDisplayName, tasks.Count, tasks.Select(t => t.TaskCompletion))), + groupGatherWindow: QueuedTesTaskPoolGroupGatherWindow, + maxCount: int.MaxValue); + + pool = batchPools.GetPoolOrDefault(await poolCompletion.Task); // This ensures that the pool is managed by this BatchScheduler + + if (pool is null) { - result = default; + throw new System.Diagnostics.UnreachableException("Pool should have been obtained by this point."); + } + } - if (tasksPoolMetadataByPoolKey.TryRemove(key, out var listOfTaskMetadata)) - { - result = (key, listOfTaskMetadata); - return true; - } + var tesTaskLog = tesTask.AddTesTaskLog(); + tesTaskLog.VirtualMachineInfo = virtualMachineInfo; + var cloudTaskId = $"{tesTask.Id}-{tesTask.Logs.Count}"; + tesTask.PoolId = pool.PoolId; + var cloudTask = await ConvertTesTaskToBatchTaskUsingRunnerAsync(cloudTaskId, tesTask, identities.Last(), cancellationToken); + + logger.LogInformation(@"Creating batch task for TES task {TesTaskId}. 
Using VM size {VmSize}.", tesTask.Id, virtualMachineInfo.VmSize); + + AddTValueToCollectorQueue( + key: pool.PoolId, + value: new(cloudTask, taskCompletion), + dictionary: _queuedTesTaskPendingTasksByJob, + enqueue: (key, tasks) => _queuedTesTaskPendingJobBatches.Enqueue((key, tasks)), + groupGatherWindow: QueuedTesTaskTaskGroupGatherWindow, + maxCount: 100); + } + catch (Exception exception) + { + taskCompletion.SetException(exception); + } - return false; - } + return taskCompletion.Task; + } - var (exceededQuantity, exception) = await quotaVerifier.CheckBatchAccountPoolAndJobQuotasAsync(requiredNewPools.Length, cancellationToken); + /// + /// Adds an entry to the queue in the directory's value for a key in a IDisposable-safe pattern and set that timer after the entry is in the dictionary only once. + /// + /// The type of the key. + /// The type of the value. + /// The key. + /// The value. + /// The grouping dictionary into which to insert into the 's value's queue. + /// The method to transfer the group to the processing queue. + /// The time period within which any additional tasks are added to the same group. + /// The maximum group size. 
+ private static void AddTValueToCollectorQueue(TKey key, TValue value, ConcurrentDictionary> dictionary, Action> enqueue, TimeSpan groupGatherWindow, int maxCount) + { + // Save a list of timers created in this method invocation because ConcurrentDictionary.AddOrUpdate() can call addValueFactory any number of times and never store the result anywhere, resulting in created timers without reachable references + List timers = []; + var entry = dictionary.AddOrUpdate(key: key, + addValueFactory: key => + new(Timer: CreateTimer( + callback: state => QueuedTesTaskAddTaskEntryToQueueFromDirectory( + key: (TKey)state, + dictionary: dictionary, + enqueue: enqueue, + groupGatherWindow: groupGatherWindow, + maxCount: maxCount), + state: key), + Queue: [value]), + updateValueFactory: (key, entry) => new(Timer: entry.Timer, Queue: entry.Queue.Enqueue(value))); + + // If a new entry in the dictionary was created, set that entry's timer to run and don't dispose that timer + if (timers.Remove(entry.Timer)) + { + entry.Timer.Change(groupGatherWindow, Timeout.InfiniteTimeSpan); + } - foreach (var (key, listOfTaskMetadata) in requiredNewPools - .Reverse() // TODO: do we want to favor earlier or later tasks? 
- .SelectWhere)>(TryRemoveKeyAndTasks) - .Take(exceededQuantity)) - { - foreach (var task in listOfTaskMetadata.Select(m => m.TesTask)) - { - yield return new(HandleExceptionAsync(exception, key, task), task); - } - } - } + // Dispose all remaining timers + timers.ForEach(t => t.Dispose()); - logger.LogDebug(@"Obtaining {PoolQuantity} batch pool identifiers for {QueuedTasks} tasks.", tasksPoolMetadataByPoolKey.Count, tasksPoolMetadataByPoolKey.Values.Sum(l => l.Length)); + Timer CreateTimer(TimerCallback callback, TKey state) + { + Timer timer = new(callback, state, Timeout.Infinite, Timeout.Infinite); + timers.Add(timer); + return timer; + } + } - await Parallel.ForEachAsync(tasksPoolMetadataByPoolKey, cancellationToken, async (pool, token) => - { - var (_, virtualMachineInfo, identities, displayName) = pool.Value.First(); + // Move entries from a ConcurrentDictionary entry (collection queue) to another queue (processing queue) as a single entry + private static void QueuedTesTaskAddTaskEntryToQueueFromDirectory(TKey key, ConcurrentDictionary> dictionary, Action> enqueue, TimeSpan groupGatherWindow, int maxCount) + { + if (!dictionary.TryGetValue(key, out var refValue)) + { + return; // Quick return + } - try - { - var useGen2 = virtualMachineInfo.HyperVGenerations?.Contains("V2", StringComparer.OrdinalIgnoreCase) ?? false; - var poolId = (await GetOrAddPoolAsync( - key: pool.Key, - isPreemptable: virtualMachineInfo.LowPriority, - modelPoolFactory: async (id, ct) => await GetPoolSpecification( - name: id, - displayName: displayName, - poolIdentity: GetBatchPoolIdentity(identities), - vmSize: virtualMachineInfo.VmSize, - vmFamily: virtualMachineInfo.VmFamily, - preemptable: virtualMachineInfo.LowPriority, - initialTarget: neededPoolNodesByPoolKey[pool.Key], - nodeInfo: useGen2 ? 
gen2BatchNodeInfo : gen1BatchNodeInfo, - encryptionAtHostSupported: virtualMachineInfo.EncryptionAtHostSupported, - cancellationToken: ct), - cancellationToken: token) - ).PoolId; + var (timer, queue) = refValue; + List tasks = []; - tasksJobMetadata.Add(new(pool.Key, poolId, virtualMachineInfo, pool.Value.Select(tuple => tuple.TesTask))); - } - catch (AggregateException aggregateException) - { - var innerExceptions = aggregateException.Flatten().InnerExceptions; + while (!queue.IsEmpty && tasks.Count < maxCount) + { + queue = queue.Dequeue(out var task); + tasks.Add(task); + } - foreach (var tesTask in pool.Value.Select(tuple => tuple.TesTask)) - { - TaskCatchAggregateException(innerExceptions, tesTask, pool.Key); - } - } - catch (Exception exception) - { - foreach (var tesTask in pool.Value.Select(tuple => tuple.TesTask)) - { - TaskCatchException(exception, tesTask, pool.Key); - } - } + enqueue(key, tasks); + + // Remove enqueued entries from directory without leaving empty entries. This is a loop because we are using ConcurrentDirectory + for (; + !(queue.IsEmpty switch + { + true => dictionary.TryRemove(new(key, refValue)), + false => dictionary.TryUpdate(key, new(timer, queue), refValue), }); + queue = ImmutableQueue.CreateRange(refValue.Queue.WhereNot(tasks.Contains))) + { + refValue = dictionary[key]; } - // Return any results that are ready - foreach (var result in results) + if (queue.IsEmpty) { - yield return result; + // Entry was removed from directory + timer.Dispose(); } - - if (tasksJobMetadata.IsEmpty) + else { - yield break; + // Entry was retained in directory + timer.Change(groupGatherWindow, Timeout.InfiniteTimeSpan); } + } - results.Clear(); - - ConcurrentBag tasksMetadata = []; + /// + public async ValueTask PerformBackgroundTasksAsync(CancellationToken cancellationToken) + { + // Add a batch of tasks to a job + if (_queuedTesTaskPendingJobBatches.TryDequeue(out var jobBatch)) + { + var (jobId, tasks) = jobBatch; + logger.LogDebug(@"Adding 
{AddedTasks} tasks to {CloudJob}.", tasks.Count, jobId); + await PerformTaskAsync( + method: async token => await azureProxy.AddBatchTasksAsync(tasks.Select(t => t.CloudTask), jobId, token), + taskCompletions: tasks.Select(task => task.TaskCompletion), + cancellationToken: cancellationToken); + } - async Task GetCloudTaskAsync(TesTask tesTask, VirtualMachineInformation virtualMachineInfo, string poolKey, string poolId, string acrPullIdentity, CancellationToken cancellationToken) + // Apply Pool and Job Quota limits { - try - { - var tesTaskLog = tesTask.AddTesTaskLog(); - tesTaskLog.VirtualMachineInfo = virtualMachineInfo; - var cloudTaskId = $"{tesTask.Id}-{tesTask.Logs.Count}"; - tesTask.PoolId = poolId; - var cloudTask = await ConvertTesTaskToBatchTaskUsingRunnerAsync(cloudTaskId, tesTask, acrPullIdentity, cancellationToken); - - logger.LogInformation(@"Creating batch task for TES task {TesTaskId}. Using VM size {VmSize}.", tesTask.Id, virtualMachineInfo.VmSize); - return cloudTask; - } - catch (AggregateException aggregateException) - { - TaskCatchAggregateException(aggregateException.Flatten().InnerExceptions, tesTask, poolKey); - } - catch (Exception exception) + Dictionary pools = []; + + while (_queuedTesTaskPendingPoolQuotas.TryDequeue(out var pendingPool)) { - TaskCatchException(exception, tesTask, poolKey); + pools.Add(pendingPool.PoolKey, pendingPool); } - return null; - } - - await Parallel.ForEachAsync( - tasksJobMetadata.Select(metadata => (metadata.JobId, metadata.PoolKey, metadata.Tasks, CloudTasks: metadata.Tasks - .Select(task => new RelatedTask(GetCloudTaskAsync(task, metadata.VirtualMachineInfo, metadata.PoolKey, metadata.JobId, acrPullIdentity, cancellationToken), task)) - .WhenEach(cancellationToken, task => task.Task))), - cancellationToken, - async (metadata, token) => + if (pools.Count != 0) { - var (jobId, poolKey, tasks, relatedCloudTasks) = metadata; + // Determine how many new pools/jobs we need now + var requiredNewPools = 
pools.Keys.WhereNot(IsPoolAvailable).ToList(); - try + // Revisit pool/job quotas (the task quota analysis already dealt with the possibility of needing just one more pool or job). + if (requiredNewPools.Skip(1).Any()) { - var cloudTasks = (await relatedCloudTasks.ToListAsync(token)).Where(task => task.Task.Result is not null); - await azureProxy.AddBatchTasksAsync(cloudTasks.Select(task => task.Task.Result), jobId, token); + // This will remove pool keys we cannot accommodate due to quota, along with all of their associated tasks, from being queued into Batch. + logger.LogDebug(@"Checking pools and jobs quota to accommodate {NeededPools} additional pools.", requiredNewPools.Count); - tasksMetadata.Add(new(poolKey, cloudTasks.Select(task => task.Related))); - } - catch (AggregateException aggregateException) - { - var innerExceptions = aggregateException.Flatten().InnerExceptions; + var (exceededQuantity, exception) = await quotaVerifier.CheckBatchAccountPoolAndJobQuotasAsync(requiredNewPools.Count, cancellationToken); - foreach (var tesTask in tasks) + foreach (var task in ((IEnumerable)requiredNewPools) + .Reverse() + .SelectWhere(TryRemovePool) + .Take(exceededQuantity) + .SelectMany(t => t.TaskCompletions)) { - TaskCatchAggregateException(innerExceptions, tesTask, poolKey); + task.SetException(exception); } - } - catch (Exception exception) - { - foreach (var tesTask in tasks) + + bool TryRemovePool(string key, out PendingPool result) { - TaskCatchException(exception, tesTask, poolKey); + logger.LogDebug(@"Due to quotas, unable to accommodate {PoolKey} batch pools.", key); + result = pools[key]; + pools.Remove(key); + return true; } } - }); - // Return any results that are ready - foreach (var result in results) - { - yield return result; + logger.LogDebug(@"Obtaining {NewPools} batch pools.", pools.Count); + + foreach (var poolToCreate in pools) + { + _queuedTesTaskPendingPools.Enqueue(poolToCreate.Value); + } + } } - if (tasksMetadata.IsEmpty) + // Create a 
batch pool + if (_queuedTesTaskPendingPools.TryDequeue(out var pool)) { - yield break; + logger.LogDebug(@"Creating pool for {PoolKey}.", pool.PoolKey); + await PerformTaskOfTAsync( + method: async token => (await GetOrAddPoolAsync( + key: pool.PoolKey, + isPreemptable: pool.VirtualMachineInfo.LowPriority, + modelPoolFactory: async (id, ct) => await GetPoolSpecification( + name: id, + displayName: pool.PoolDisplayName, + poolIdentity: GetBatchPoolIdentity(pool.Identities.WhereNot(string.IsNullOrWhiteSpace).ToList()), + vmSize: pool.VirtualMachineInfo.VmSize, + vmFamily: pool.VirtualMachineInfo.VmFamily, + preemptable: pool.VirtualMachineInfo.LowPriority, + initialTarget: pool.InitialTarget, + nodeInfo: (pool.VirtualMachineInfo.HyperVGenerations?.Contains("V2", StringComparer.OrdinalIgnoreCase) ?? false) ? gen2BatchNodeInfo : gen1BatchNodeInfo, + encryptionAtHostSupported: pool.VirtualMachineInfo.EncryptionAtHostSupported, + cancellationToken: ct), + cancellationToken: token)) + .PoolId, + taskCompletions: pool.TaskCompletions, + cancellationToken: cancellationToken); } - results.Clear(); - - _ = Parallel.ForEach(tasksMetadata.SelectMany(metadata => metadata.Tasks.Select(task => (task, metadata.PoolKey))), metadata => + async static ValueTask PerformTaskAsync(Func method, IEnumerable taskCompletions, CancellationToken cancellationToken) { - var (tesTask, poolKey) = metadata; - try { - var tesTaskLog = tesTask.GetOrAddTesTaskLog(); - tesTaskLog.StartTime = DateTimeOffset.UtcNow; - tesTask.State = TesState.INITIALIZING; - results.Add(new(Task.FromResult(true), tesTask)); - } - catch (AggregateException aggregateException) - { - TaskCatchAggregateException(aggregateException.Flatten().InnerExceptions, tesTask, poolKey); + await method(cancellationToken); + taskCompletions.ForEach(completion => completion.SetResult()); } catch (Exception exception) { - TaskCatchException(exception, tesTask, poolKey); + taskCompletions.ForEach(completion => completion.SetException(new 
AggregateException(Enumerable.Empty().Append(exception)))); } - }); - - foreach (var result in results) - { - yield return result; } - yield break; - - void TaskCatchException(Exception exception, TesTask tesTask, string poolKey) + async static ValueTask PerformTaskOfTAsync(Func> method, IEnumerable> taskCompletions, CancellationToken cancellationToken) { - results.Add(new(HandleExceptionAsync(exception, poolKey, tesTask), tesTask)); - } - - void TaskCatchAggregateException(IEnumerable innerExceptions, TesTask tesTask, string poolKey) - { - var result = false; - var exceptions = new List(); - - foreach (var partResult in innerExceptions - .Select(ex => HandleExceptionAsync(ex, poolKey, tesTask))) + try { - if (partResult.IsFaulted) - { - exceptions.Add(partResult.Exception); - } - else - { - result |= partResult.Result; - } + var result = await method(cancellationToken); + taskCompletions.ForEach(completion => completion.SetResult(result)); + } + catch (Exception exception) + { + taskCompletions.ForEach(completion => completion.SetException(new AggregateException(Enumerable.Empty().Append(exception)))); } - - results.Add(new(exceptions.Count == 0 - ? 
Task.FromResult(result) - : Task.FromException(new AggregateException(exceptions)), - tesTask)); } + } - Task HandleExceptionAsync(Exception exception, string poolKey, TesTask tesTask) + Task QueuedTesTaskHandleExceptionAsync(Exception exception, string poolKey, TesTask tesTask) + { + switch (exception) { - switch (exception) - { - case AzureBatchPoolCreationException azureBatchPoolCreationException: - if (!azureBatchPoolCreationException.IsTimeout && !azureBatchPoolCreationException.IsJobQuota && !azureBatchPoolCreationException.IsPoolQuota && azureBatchPoolCreationException.InnerException is not null) - { - return HandleExceptionAsync(azureBatchPoolCreationException.InnerException, poolKey, tesTask); - } - - logger.LogWarning(azureBatchPoolCreationException, "TES task: {TesTask} AzureBatchPoolCreationException.Message: {ExceptionMessage}. This might be a transient issue. Task will remain with state QUEUED. Confirmed timeout: {ConfirmedTimeout}", tesTask.Id, azureBatchPoolCreationException.Message, azureBatchPoolCreationException.IsTimeout); - - if (azureBatchPoolCreationException.IsJobQuota || azureBatchPoolCreationException.IsPoolQuota) - { - neededPools.Add(poolKey); - tesTask.SetWarning(azureBatchPoolCreationException.InnerException switch - { - null => "Unknown reason", - Microsoft.Rest.Azure.CloudException cloudException => cloudException.Body.Message, - var e when e is BatchException batchException && batchException.InnerException is Microsoft.Azure.Batch.Protocol.Models.BatchErrorException batchErrorException => batchErrorException.Body.Message.Value, - _ => "Unknown reason", - }); - } - - break; + case AzureBatchPoolCreationException azureBatchPoolCreationException: + if (!azureBatchPoolCreationException.IsTimeout && !azureBatchPoolCreationException.IsJobQuota && !azureBatchPoolCreationException.IsPoolQuota && azureBatchPoolCreationException.InnerException is not null) + { + return 
QueuedTesTaskHandleExceptionAsync(azureBatchPoolCreationException.InnerException, poolKey, tesTask); + } - case AzureBatchQuotaMaxedOutException azureBatchQuotaMaxedOutException: - logger.LogWarning("TES task: {TesTask} AzureBatchQuotaMaxedOutException.Message: {ExceptionMessage}. Not enough quota available. Task will remain with state QUEUED.", tesTask.Id, azureBatchQuotaMaxedOutException.Message); - neededPools.Add(poolKey); - break; - - case AzureBatchLowQuotaException azureBatchLowQuotaException: - tesTask.State = TesState.SYSTEM_ERROR; - tesTask.AddTesTaskLog(); // Adding new log here because this exception is thrown from CheckBatchAccountQuotas() and AddTesTaskLog() above is called after that. This way each attempt will have its own log entry. - tesTask.SetFailureReason("InsufficientBatchQuota", azureBatchLowQuotaException.Message); - logger.LogError(azureBatchLowQuotaException, "TES task: {TesTask} AzureBatchLowQuotaException.Message: {ExceptionMessage}", tesTask.Id, azureBatchLowQuotaException.Message); - break; - - case AzureBatchVirtualMachineAvailabilityException azureBatchVirtualMachineAvailabilityException: - tesTask.State = TesState.SYSTEM_ERROR; - tesTask.AddTesTaskLog(); // Adding new log here because this exception is thrown from GetVmSizeAsync() and AddTesTaskLog() above is called after that. This way each attempt will have its own log entry. 
- tesTask.SetFailureReason("NoVmSizeAvailable", azureBatchVirtualMachineAvailabilityException.Message); - logger.LogError(azureBatchVirtualMachineAvailabilityException, "TES task: {TesTask} AzureBatchVirtualMachineAvailabilityException.Message: {ExceptionMessage}", tesTask.Id, azureBatchVirtualMachineAvailabilityException.Message); - break; - - case TesException tesException: - tesTask.State = TesState.SYSTEM_ERROR; - tesTask.SetFailureReason(tesException); - logger.LogError(tesException, "TES task: {TesTask} TesException.Message: {ExceptionMessage}", tesTask.Id, tesException.Message); - break; - - case BatchClientException batchClientException: - tesTask.State = TesState.SYSTEM_ERROR; - tesTask.SetFailureReason("BatchClientException", string.Join(",", batchClientException.Data.Values), batchClientException.Message, batchClientException.StackTrace); - logger.LogError(batchClientException, "TES task: {TesTask} BatchClientException.Message: {ExceptionMessage} {ExceptionData}", tesTask.Id, batchClientException.Message, string.Join(",", batchClientException?.Data?.Values)); - break; - - case BatchException batchException when batchException.InnerException is Microsoft.Azure.Batch.Protocol.Models.BatchErrorException batchErrorException && AzureBatchPoolCreationException.IsJobQuotaException(batchErrorException.Body.Code): - tesTask.SetWarning(batchErrorException.Body.Message.Value, []); - logger.LogInformation("Not enough job quota available for task Id {TesTask}. Reason: {BodyMessage}. 
Task will remain in queue.", tesTask.Id, batchErrorException.Body.Message.Value); - break; - - case BatchException batchException when batchException.InnerException is Microsoft.Azure.Batch.Protocol.Models.BatchErrorException batchErrorException && AzureBatchPoolCreationException.IsPoolQuotaException(batchErrorException.Body.Code): - neededPools.Add(poolKey); - tesTask.SetWarning(batchErrorException.Body.Message.Value, []); - logger.LogInformation("Not enough pool quota available for task Id {TesTask}. Reason: {BodyMessage}. Task will remain in queue.", tesTask.Id, batchErrorException.Body.Message.Value); - break; + logger.LogWarning(azureBatchPoolCreationException, "TES task: {TesTask} AzureBatchPoolCreationException.Message: {ExceptionMessage}. This might be a transient issue. Task will remain with state QUEUED. Confirmed timeout: {ConfirmedTimeout}", tesTask.Id, azureBatchPoolCreationException.Message, azureBatchPoolCreationException.IsTimeout); - case Microsoft.Rest.Azure.CloudException cloudException when AzureBatchPoolCreationException.IsPoolQuotaException(cloudException.Body.Code): + if (azureBatchPoolCreationException.IsJobQuota || azureBatchPoolCreationException.IsPoolQuota) + { neededPools.Add(poolKey); - tesTask.SetWarning(cloudException.Body.Message, []); - logger.LogInformation("Not enough pool quota available for task Id {TesTask}. Reason: {BodyMessage}. 
Task will remain in queue.", tesTask.Id, cloudException.Body.Message); - break; - - default: - tesTask.State = TesState.SYSTEM_ERROR; - tesTask.SetFailureReason(AzureBatchTaskState.UnknownError, $"{exception?.GetType().FullName}: {exception?.Message}", exception?.StackTrace); - logger.LogError(exception, "TES task: {TesTask} Exception: {ExceptionType}: {ExceptionMessage}", tesTask.Id, exception?.GetType().FullName, exception?.Message); - break; - } + tesTask.SetWarning(azureBatchPoolCreationException.InnerException switch + { + null => "Unknown reason", + Azure.RequestFailedException requestFailedException => $"{requestFailedException.ErrorCode}: \"{requestFailedException.Message}\"{requestFailedException.Data.Keys.Cast().Zip(requestFailedException.Data.Values.Cast()).Select(p => $"\n{p.First}: {p.Second}")}", + Microsoft.Rest.Azure.CloudException cloudException => cloudException.Body.Message, + var e when e is BatchException batchException && batchException.InnerException is Microsoft.Azure.Batch.Protocol.Models.BatchErrorException batchErrorException => batchErrorException.Body.Message.Value, + _ => "Unknown reason", + }); + } - return Task.FromResult(true); + break; + + case AzureBatchQuotaMaxedOutException azureBatchQuotaMaxedOutException: + logger.LogWarning("TES task: {TesTask} AzureBatchQuotaMaxedOutException.Message: {ExceptionMessage}. Not enough quota available. Task will remain with state QUEUED.", tesTask.Id, azureBatchQuotaMaxedOutException.Message); + neededPools.Add(poolKey); + break; + + case AzureBatchLowQuotaException azureBatchLowQuotaException: + tesTask.State = TesState.SYSTEM_ERROR; + tesTask.AddTesTaskLog(); // Adding new log here because this exception is thrown from CheckBatchAccountQuotas() and AddTesTaskLog() above is called after that. This way each attempt will have its own log entry. 
+ tesTask.SetFailureReason("InsufficientBatchQuota", azureBatchLowQuotaException.Message); + logger.LogError(azureBatchLowQuotaException, "TES task: {TesTask} AzureBatchLowQuotaException.Message: {ExceptionMessage}", tesTask.Id, azureBatchLowQuotaException.Message); + break; + + case AzureBatchVirtualMachineAvailabilityException azureBatchVirtualMachineAvailabilityException: + tesTask.State = TesState.SYSTEM_ERROR; + tesTask.AddTesTaskLog(); // Adding new log here because this exception is thrown from GetVmSizeAsync() and AddTesTaskLog() above is called after that. This way each attempt will have its own log entry. + tesTask.SetFailureReason("NoVmSizeAvailable", azureBatchVirtualMachineAvailabilityException.Message); + logger.LogError(azureBatchVirtualMachineAvailabilityException, "TES task: {TesTask} AzureBatchVirtualMachineAvailabilityException.Message: {ExceptionMessage}", tesTask.Id, azureBatchVirtualMachineAvailabilityException.Message); + break; + + case TesException tesException: + tesTask.State = TesState.SYSTEM_ERROR; + tesTask.SetFailureReason(tesException); + logger.LogError(tesException, "TES task: {TesTask} TesException.Message: {ExceptionMessage}", tesTask.Id, tesException.Message); + break; + + case BatchClientException batchClientException: + tesTask.State = TesState.SYSTEM_ERROR; + tesTask.SetFailureReason("BatchClientException", string.Join(",", batchClientException.Data.Values), batchClientException.Message, batchClientException.StackTrace); + logger.LogError(batchClientException, "TES task: {TesTask} BatchClientException.Message: {ExceptionMessage} {ExceptionData}", tesTask.Id, batchClientException.Message, string.Join(",", batchClientException?.Data?.Values)); + break; + + case BatchException batchException when batchException.InnerException is Microsoft.Azure.Batch.Protocol.Models.BatchErrorException batchErrorException && AzureBatchPoolCreationException.IsJobQuotaException(batchErrorException.Body.Code): + 
tesTask.SetWarning(batchErrorException.Body.Message.Value, []); + logger.LogInformation("Not enough job quota available for task Id {TesTask}. Reason: {BodyMessage}. Task will remain in queue.", tesTask.Id, batchErrorException.Body.Message.Value); + break; + + case BatchException batchException when batchException.InnerException is Microsoft.Azure.Batch.Protocol.Models.BatchErrorException batchErrorException && AzureBatchPoolCreationException.IsPoolQuotaException(batchErrorException.Body.Code): + neededPools.Add(poolKey); + tesTask.SetWarning(batchErrorException.Body.Message.Value, []); + logger.LogInformation("Not enough pool quota available for task Id {TesTask}. Reason: {BodyMessage}. Task will remain in queue.", tesTask.Id, batchErrorException.Body.Message.Value); + break; + + case Microsoft.Rest.Azure.CloudException cloudException when AzureBatchPoolCreationException.IsPoolQuotaException(cloudException.Body.Code): + neededPools.Add(poolKey); + tesTask.SetWarning(cloudException.Body.Message, []); + logger.LogInformation("Not enough pool quota available for task Id {TesTask}. Reason: {BodyMessage}. 
Task will remain in queue.", tesTask.Id, cloudException.Body.Message); + break; + + default: + tesTask.State = TesState.SYSTEM_ERROR; + tesTask.SetFailureReason(AzureBatchTaskState.UnknownError, $"{exception?.GetType().FullName}: {exception?.Message}", exception?.StackTrace); + logger.LogError(exception, "TES task: {TesTask} Exception: {ExceptionType}: {ExceptionMessage}", tesTask.Id, exception?.GetType().FullName, exception?.Message); + break; } + + return Task.FromResult(true); } /// diff --git a/src/TesApi.Web/Extensions/EmumerableExtensions.cs b/src/TesApi.Web/Extensions/EnumerableExtensions.cs similarity index 98% rename from src/TesApi.Web/Extensions/EmumerableExtensions.cs rename to src/TesApi.Web/Extensions/EnumerableExtensions.cs index d548b5e78..e00aa65b7 100644 --- a/src/TesApi.Web/Extensions/EmumerableExtensions.cs +++ b/src/TesApi.Web/Extensions/EnumerableExtensions.cs @@ -10,7 +10,7 @@ namespace TesApi.Web.Extensions /// /// extensions /// - public static class EmumerableExtensions + public static class EnumerableExtensions { /// /// Filters a sequence of values based on a false predicate. diff --git a/src/TesApi.Web/IBatchScheduler.cs b/src/TesApi.Web/IBatchScheduler.cs index a66158d24..a6c088974 100644 --- a/src/TesApi.Web/IBatchScheduler.cs +++ b/src/TesApi.Web/IBatchScheduler.cs @@ -37,7 +37,7 @@ public interface IBatchScheduler Task UploadTaskRunnerIfNeededAsync(CancellationToken cancellationToken); /// - /// Update s with task-related state on a batch system + /// Updates s with task-related state on a batch system /// /// s to schedule on the batch system. /// s corresponding to each . @@ -46,12 +46,12 @@ public interface IBatchScheduler IAsyncEnumerable> ProcessTesTaskBatchStatesAsync(IEnumerable tesTasks, AzureBatchTaskState[] taskStates, CancellationToken cancellationToken); /// - /// Schedule queued s on a batch system + /// Schedules a s on a batch system /// - /// s to schedule on the batch system. + /// A to schedule on the batch system. 
/// A for controlling the lifetime of the asynchronous operation. - /// True for each that needs to be persisted. - IAsyncEnumerable> ProcessQueuedTesTasksAsync(TesTask[] tesTasks, CancellationToken cancellationToken); + /// True to persist the , otherwise False. + Task ProcessQueuedTesTaskAsync(TesTask tesTask, CancellationToken cancellationToken); /// /// Adds to the managed batch pools. @@ -112,6 +112,13 @@ public interface IBatchScheduler /// IAsyncEnumerable GetEventMessagesAsync(CancellationToken cancellationToken, string @event = default); + /// + /// Performs background tasks. + /// + /// A for controlling the lifetime of the asynchronous operation. + /// + ValueTask PerformBackgroundTasksAsync(CancellationToken cancellationToken); + /// /// Identifies an azure cloud task. /// diff --git a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs index 38fefb575..e4448e26b 100644 --- a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs +++ b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs @@ -121,7 +121,7 @@ protected async ValueTask ExecuteActionOnIntervalAsync(TimeSpan runInterval, Fun } catch (Exception exc) { - Logger.LogError(exc, "{Message}", exc.Message); + Logger.LogError(exc, "{Exception}: {Message}", exc.GetType().FullName, exc.Message); } } while (await timer.WaitForNextTickAsync(cancellationToken)); @@ -131,152 +131,152 @@ protected async ValueTask ExecuteActionOnIntervalAsync(TimeSpan runInterval, Fun } /// - /// Retrieves provided actionable TES tasks from the database using , performs an action in the batch system using , and updates the resultant state in the repository. + /// Updates the repository with the changes to the TesTask, with exception-based failure reporting. /// /// Tag to disambiguate the state and/or action workflow performed in log messages. - /// Provides array of s on which to perform actions through . 
- /// Method operating on returning indicating if each needs updating into the repository. + /// . /// A for controlling the lifetime of the asynchronous operation. - /// Tag to indicate the underlying unit quantity of items processed in log messages. - /// A that represents this method's operations. - protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func>> tesTaskGetter, Func>> tesTaskProcessor, CancellationToken cancellationToken, string unitsLabel = "tasks") + /// + protected async ValueTask ProcessOrchestratedTesTaskAsync(string pollName, RelatedTask task, CancellationToken cancellationToken) { - var tesTasks = await (await tesTaskGetter(cancellationToken)).ToArrayAsync(cancellationToken); + var tesTask = task.Related; - if (tesTasks.All(task => task is null)) - { - // Quick return for no tasks - return; - } - - var startTime = DateTime.UtcNow; - - await foreach (var tesTaskTask in tesTaskProcessor(tesTasks, cancellationToken).WithCancellation(cancellationToken)) + try { - var tesTask = tesTaskTask.Related; + var isModified = false; try { - var isModified = false; - - try + isModified = await task; + } + catch (Exception exc) + { + if (++tesTask.ErrorCount > 3 || // TODO: Should we increment this for exceptions here (current behavior) or the attempted executions on the batch? + IsExceptionHttpConflictWhereTaskIsComplete(exc)) { - isModified = await tesTaskTask; + tesTask.State = TesState.SYSTEM_ERROR; + tesTask.EndTime = DateTimeOffset.UtcNow; + tesTask.SetFailureReason(AzureBatchTaskState.UnknownError, exc.Message, exc.StackTrace); } - catch (Exception exc) + + if (exc is Microsoft.Azure.Batch.Common.BatchException batchException) { - if (++tesTask.ErrorCount > 3 || // TODO: Should we increment this for exceptions here (current behavior) or the attempted executions on the batch? 
- IsExceptionHttpConflictWhereTaskIsComplete(exc)) + var requestInfo = batchException.RequestInformation; + var reason = (batchException.InnerException as Microsoft.Azure.Batch.Protocol.Models.BatchErrorException)?.Response?.ReasonPhrase; + var logs = new List(); + + if (requestInfo?.ServiceRequestId is not null) { - tesTask.State = TesState.SYSTEM_ERROR; - tesTask.EndTime = DateTimeOffset.UtcNow; - tesTask.SetFailureReason(AzureBatchTaskState.UnknownError, exc.Message, exc.StackTrace); + logs.Add($"Azure batch ServiceRequestId: {requestInfo.ServiceRequestId}"); } - if (exc is Microsoft.Azure.Batch.Common.BatchException batchException) + if (requestInfo?.BatchError is not null) { - var requestInfo = batchException.RequestInformation; - var reason = (batchException.InnerException as Microsoft.Azure.Batch.Protocol.Models.BatchErrorException)?.Response?.ReasonPhrase; - var logs = new List(); - - if (requestInfo?.ServiceRequestId is not null) - { - logs.Add($"Azure batch ServiceRequestId: {requestInfo.ServiceRequestId}"); - } + logs.Add($"BatchErrorCode: {requestInfo.BatchError.Code}"); + logs.Add($"BatchErrorMessage ({requestInfo.BatchError.Message.Language}): {requestInfo.BatchError.Message.Value}"); - if (requestInfo?.BatchError is not null) + foreach (var detail in requestInfo.BatchError.Values?.Select(d => $"BatchErrorDetail: '{d.Key}': '{d.Value}'") ?? []) { - logs.Add($"BatchErrorCode: {requestInfo.BatchError.Code}"); - logs.Add($"BatchErrorMessage ({requestInfo.BatchError.Message.Language}): {requestInfo.BatchError.Message.Value}"); - - foreach (var detail in requestInfo.BatchError.Values?.Select(d => $"BatchErrorDetail: '{d.Key}': '{d.Value}'") ?? 
[]) - { - logs.Add(detail); - } + logs.Add(detail); } - - tesTask.AddToSystemLog(logs); } - Logger.LogError(exc, "TES task: {TesTask} threw an exception in OrchestrateTesTasksOnBatch({Poll}).", tesTask.Id, pollName); - await Repository.UpdateItemAsync(tesTask, cancellationToken); + tesTask.AddToSystemLog(logs); } - if (isModified) + if (exc is Azure.RequestFailedException requestFailedException) { - var hasErrored = false; - var hasEnded = false; + var logs = new List(); - switch (tesTask.State) + if (!string.IsNullOrWhiteSpace(requestFailedException.ErrorCode)) { - case TesState.CANCELED: - case TesState.COMPLETE: - hasEnded = true; - break; - - case TesState.EXECUTOR_ERROR: - case TesState.SYSTEM_ERROR: - hasErrored = true; - hasEnded = true; - break; - - default: - break; + logs.Add(requestFailedException.ErrorCode); } - if (hasEnded) + if (!string.IsNullOrWhiteSpace(requestFailedException.Message)) { - tesTask.EndTime = DateTimeOffset.UtcNow; + logs.Add(requestFailedException.Message); } - if (hasErrored) + if (requestFailedException.Data is not null) { - Logger.LogDebug("{TesTask} failed, state: {TesTaskState}, reason: {TesTaskFailureReason}", tesTask.Id, tesTask.State, tesTask.FailureReason); + foreach (var detail in requestFailedException.Data.Keys.Cast().Zip(requestFailedException.Data.Values.Cast()).Select(p => $"RequestFailureDetail '{p.First}': '{p.Second}'")) + { + logs.Add(detail); + } } - - await Repository.UpdateItemAsync(tesTask, cancellationToken); } + + Logger.LogError(exc, "TES task: {TesTask} threw an exception in OrchestrateTesTasksOnBatch({Poll}).", tesTask.Id, pollName); + await Repository.UpdateItemAsync(tesTask, cancellationToken); } - catch (RepositoryCollisionException rce) + + if (isModified) { - Logger.LogError(rce, "RepositoryCollisionException in OrchestrateTesTasksOnBatch({Poll})", pollName); + var hasErrored = false; + var hasEnded = false; - try + switch (tesTask.State) { - var currentTesTask = await rce.Task; - - if 
(currentTesTask is not null && currentTesTask.IsActiveState()) - { - currentTesTask.SetWarning(rce.Message); + case TesState.CANCELED: + case TesState.COMPLETE: + hasEnded = true; + break; + + case TesState.EXECUTOR_ERROR: + case TesState.SYSTEM_ERROR: + hasErrored = true; + hasEnded = true; + break; + + default: + break; + } - if (currentTesTask.IsActiveState()) - { - // TODO: merge tesTask and currentTesTask - } + if (hasEnded) + { + tesTask.EndTime = DateTimeOffset.UtcNow; + } - await Repository.UpdateItemAsync(currentTesTask, cancellationToken); - } + if (hasErrored) + { + Logger.LogDebug("{TesTask} failed, state: {TesTaskState}, reason: {TesTaskFailureReason}", tesTask.Id, tesTask.State, tesTask.FailureReason); } - catch (Exception exc) + + await Repository.UpdateItemAsync(tesTask, cancellationToken); + } + } + catch (RepositoryCollisionException rce) + { + Logger.LogError(rce, "RepositoryCollisionException in OrchestrateTesTasksOnBatch({Poll})", pollName); + + try + { + var currentTesTask = await rce.Task; + + if (currentTesTask is not null && currentTesTask.IsActiveState()) { - // Consider retrying repository.UpdateItemAsync() if this exception was thrown from 'await rce.Task' - Logger.LogError(exc, "Updating TES Task '{TesTask}' threw {ExceptionType}: '{ExceptionMessage}'. Stack trace: {ExceptionStackTrace}", tesTask.Id, exc.GetType().FullName, exc.Message, exc.StackTrace); + currentTesTask.SetWarning(rce.Message); + + if (currentTesTask.IsActiveState()) + { + // TODO: merge tesTask and currentTesTask + } + + await Repository.UpdateItemAsync(currentTesTask, cancellationToken); } } catch (Exception exc) { + // Consider retrying repository.UpdateItemAsync() if this exception was thrown from 'await rce.Task' Logger.LogError(exc, "Updating TES Task '{TesTask}' threw {ExceptionType}: '{ExceptionMessage}'. 
Stack trace: {ExceptionStackTrace}", tesTask.Id, exc.GetType().FullName, exc.Message, exc.StackTrace); } } - - if (BatchScheduler.NeedPoolFlush) + catch (Exception exc) { - var pools = (await Repository.GetItemsAsync(task => task.State == TesState.INITIALIZING || task.State == TesState.RUNNING, cancellationToken)).Select(task => task.PoolId).Distinct(); - await BatchScheduler.FlushPoolsAsync(pools, cancellationToken); + Logger.LogError(exc, "Updating TES Task '{TesTask}' threw {ExceptionType}: '{ExceptionMessage}'. Stack trace: {ExceptionStackTrace}", tesTask.Id, exc.GetType().FullName, exc.Message, exc.StackTrace); } - Logger.LogDebug("OrchestrateTesTasksOnBatch({Poll}) for {TaskCount} {UnitsLabel} completed in {TotalSeconds:c}.", pollName, tesTasks.Where(task => task is not null).Count(), unitsLabel, DateTime.UtcNow.Subtract(startTime)); - static bool IsExceptionHttpConflictWhereTaskIsComplete(Exception exc) { if (exc is Microsoft.Azure.Batch.Common.BatchException batchException) @@ -288,5 +288,42 @@ static bool IsExceptionHttpConflictWhereTaskIsComplete(Exception exc) return false; } } + + /// + /// Retrieves provided actionable TES tasks from the database using , performs an action in the batch system using , and updates the resultant state in the repository. + /// + /// Tag to disambiguate the state and/or action workflow performed in log messages. + /// Provides array of s on which to perform actions through . + /// Method operating on returning indicating if each needs updating into the repository. + /// A for controlling the lifetime of the asynchronous operation. + /// Tag to indicate the underlying unit quantity of items processed in log messages. + /// True to process even if there are no tasks processed. + /// A that represents this method's operations. 
+ protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func>> tesTaskGetter, Func>> tesTaskProcessor, CancellationToken cancellationToken, string unitsLabel = "tasks", bool needPoolFlush = false) + { + var tesTasks = await (await tesTaskGetter(cancellationToken)).ToArrayAsync(cancellationToken); + var noTasks = tesTasks.All(task => task is null); + + if (noTasks && !needPoolFlush) + { + // Quick return for no tasks + return; + } + + var startTime = DateTime.UtcNow; + + if (!noTasks) + { + await Parallel.ForEachAsync(tesTaskProcessor(tesTasks, cancellationToken), cancellationToken, (task, token) => ProcessOrchestratedTesTaskAsync(pollName, task, token)); + } + + if (BatchScheduler.NeedPoolFlush) + { + var pools = (await Repository.GetItemsAsync(task => task.State == TesState.INITIALIZING || task.State == TesState.RUNNING, cancellationToken)).Select(task => task.PoolId).Distinct(); + await BatchScheduler.FlushPoolsAsync(pools, cancellationToken); + } + + Logger.LogDebug("OrchestrateTesTasksOnBatch({Poll}) for {TaskCount} {UnitsLabel} completed in {TotalSeconds:c}.", pollName, tesTasks.Where(task => task is not null).Count(), unitsLabel, DateTime.UtcNow.Subtract(startTime)); + } } } diff --git a/src/TesApi.Web/PoolScheduler.cs b/src/TesApi.Web/PoolScheduler.cs index 89acd5ed2..c2a0ee7d4 100644 --- a/src/TesApi.Web/PoolScheduler.cs +++ b/src/TesApi.Web/PoolScheduler.cs @@ -17,7 +17,7 @@ namespace TesApi.Web { /// - /// A background service that montitors s in the batch system, orchestrates their lifecycle, and updates their state. + /// A background service that monitors s in the batch system, orchestrates their lifecycle, and updates their state. /// This should only be used as a system-wide singleton service. This class does not support scale-out on multiple machines, /// nor does it implement a leasing mechanism. In the future, consider using the Lease Blob operation. 
/// diff --git a/src/TesApi.Web/TaskScheduler.cs b/src/TesApi.Web/TaskScheduler.cs index 3f5fc0514..4048659cc 100644 --- a/src/TesApi.Web/TaskScheduler.cs +++ b/src/TesApi.Web/TaskScheduler.cs @@ -11,6 +11,7 @@ using Tes.Models; using Tes.Repository; using TesApi.Web.Events; +using TesApi.Web.Extensions; namespace TesApi.Web { @@ -27,8 +28,9 @@ namespace TesApi.Web internal class TaskScheduler(RunnerEventsProcessor nodeEventProcessor, Microsoft.Extensions.Hosting.IHostApplicationLifetime hostApplicationLifetime, IRepository repository, IBatchScheduler batchScheduler, ILogger taskSchedulerLogger) : OrchestrateOnBatchSchedulerServiceBase(hostApplicationLifetime, repository, batchScheduler, taskSchedulerLogger) { - private readonly TimeSpan blobRunInterval = TimeSpan.FromSeconds(15); - private readonly TimeSpan batchRunInterval = TimeSpan.FromSeconds(30); // The very fastest process inside of Azure Batch accessing anything within pools or jobs appears to use a 30 second polling interval + private static readonly TimeSpan blobRunInterval = TimeSpan.FromSeconds(15); + internal static readonly TimeSpan BatchRunInterval = TimeSpan.FromSeconds(30); // The very fastest process inside of Azure Batch accessing anything within pools or jobs appears to use a 30 second polling interval + private static readonly TimeSpan backgroundRunInterval = TimeSpan.FromSeconds(1); private readonly RunnerEventsProcessor nodeEventProcessor = nodeEventProcessor; /// @@ -50,6 +52,7 @@ protected override async ValueTask ExecuteSetupAsync(CancellationToken cancellat protected override async ValueTask ExecuteCoreAsync(CancellationToken cancellationToken) { await Task.WhenAll( + ExecuteBackgroundTasksAsync(cancellationToken), ExecuteCancelledTesTasksOnBatchAsync(cancellationToken), ExecuteQueuedTesTasksOnBatchAsync(cancellationToken), ExecuteUpdateTesTaskFromEventBlobAsync(cancellationToken)); @@ -61,18 +64,35 @@ await Task.WhenAll( /// private async Task 
ExecuteQueuedTesTasksOnBatchAsync(CancellationToken cancellationToken) { - var query = new Func>>( + var query = new Func>>( async token => (await Repository.GetItemsAsync( predicate: t => t.State == TesState.QUEUED, cancellationToken: token)) - .OrderBy(t => t.CreationTime) - .ToAsyncEnumerable()); + .OrderBy(t => t.CreationTime)); - await ExecuteActionOnIntervalAsync(batchRunInterval, - token => OrchestrateTesTasksOnBatchAsync("Queued", query, BatchScheduler.ProcessQueuedTesTasksAsync, token), + await ExecuteActionOnIntervalAsync(BatchRunInterval, + async cancellation => + { + await Parallel.ForEachAsync( + (await query(cancellation)) + .Select(task => new RelatedTask(BatchScheduler.ProcessQueuedTesTaskAsync(task, cancellation), task)) + .WhenEach(cancellation, task => task.Task), + cancellation, + (task, token) => ProcessOrchestratedTesTaskAsync("Queued", task, token)); + }, cancellationToken); } + /// + /// Retrieves all event blobs from storage and updates the resultant state. + /// + /// Triggered when Microsoft.Extensions.Hosting.IHostedService.StopAsync(System.Threading.CancellationToken) is called. 
+ /// + private async Task ExecuteBackgroundTasksAsync(CancellationToken cancellationToken) + { + await ExecuteActionOnIntervalAsync(backgroundRunInterval, BatchScheduler.PerformBackgroundTasksAsync, cancellationToken); + } + /// /// Retrieves all cancelled TES tasks from the database, performs an action in the batch system, and updates the resultant state /// @@ -87,7 +107,7 @@ private async Task ExecuteCancelledTesTasksOnBatchAsync(CancellationToken cancel .OrderByDescending(t => t.CreationTime) .ToAsyncEnumerable()); - await ExecuteActionOnIntervalAsync(batchRunInterval, + await ExecuteActionOnIntervalAsync(BatchRunInterval, token => OrchestrateTesTasksOnBatchAsync( "Cancelled", query, From 7eb68916e1de1b6a3ba3e7fd0b6ac9b8d1ae2b50 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 27 Sep 2024 18:14:26 -0700 Subject: [PATCH 138/202] Implement multiple executors --- src/CommonUtilities/Models/NodeTask.cs | 22 +++-- .../Docker/VolumeBindingsGeneratorTests.cs | 20 ++-- .../Events/EventsPublisherTests.cs | 28 ++++-- src/Tes.Runner.Test/ExecutorTests.cs | 25 ++--- src/Tes.Runner/Docker/DockerExecutor.cs | 10 +- .../Docker/VolumeBindingsGenerator.cs | 48 ++++----- src/Tes.Runner/Events/EventsPublisher.cs | 28 +++--- src/Tes.Runner/Executor.cs | 21 ++-- src/Tes.RunnerCLI/Commands/CommandFactory.cs | 5 +- src/Tes.RunnerCLI/Commands/CommandHandlers.cs | 30 +++++- src/Tes.RunnerCLI/Commands/CommandLauncher.cs | 9 +- src/Tes/Models/TesExecutor.cs | 23 +++++ src/Tes/Models/TesInput.cs | 12 ++- .../Runner/NodeTaskBuilderTests.cs | 40 ++++---- .../Runner/TaskToNodeTaskConverterTests.cs | 15 ++- .../TaskServiceApiControllerTests.cs | 28 +++--- src/TesApi.Tests/expectedBasicJsonResult.json | 3 +- src/TesApi.Tests/expectedFullJsonResult.json | 3 +- src/TesApi.Web/Controllers/TaskServiceApi.cs | 36 ++++++- src/TesApi.Web/Runner/NodeTaskBuilder.cs | 97 +++++++++++-------- .../Runner/TaskToNodeTaskConverter.cs | 21 ++-- 21 files changed, 330 insertions(+), 194 deletions(-) diff 
--git a/src/CommonUtilities/Models/NodeTask.cs b/src/CommonUtilities/Models/NodeTask.cs index f83ce6bfb..cb281a3d2 100644 --- a/src/CommonUtilities/Models/NodeTask.cs +++ b/src/CommonUtilities/Models/NodeTask.cs @@ -10,11 +10,10 @@ public class NodeTask { public string? Id { get; set; } public string? WorkflowId { get; set; } - public string? ImageTag { get; set; } - public string? ImageName { get; set; } + public string? MountParentDirectory { get; set; } + public List? Executors { get; set; } public List? ContainerDeviceRequests { get; set; } - public string? ContainerWorkDir { get; set; } - public List? CommandsToExecute { get; set; } + public List? ContainerVolumes { get; set; } public List? Inputs { get; set; } public List? Outputs { get; set; } public string? MetricsFilename { get; set; } @@ -25,6 +24,19 @@ public class NodeTask public RuntimeOptions RuntimeOptions { get; set; } = null!; } + public class Executor + { + public string? ImageTag { get; set; } + public string? ImageName { get; set; } + public string? ContainerWorkDir { get; set; } + public List? CommandsToExecute { get; set; } + public string? ContainerStdIn { get; set; } + public string? ContainerStdOut { get; set; } + public string? ContainerStdErr { get; set; } + public Dictionary? ContainerEnv { get; set; } + public bool IgnoreError { get; set; } + } + public class ContainerDeviceRequest { public string? Driver { get; set; } @@ -42,7 +54,6 @@ public class ContainerDeviceRequest public class FileOutput { public string? Path { get; set; } - public string? MountParentDirectory { get; set; } public string? TargetUrl { get; set; } public TransformationStrategy? TransformationStrategy { get; set; } public FileType? FileType { get; set; } @@ -51,7 +62,6 @@ public class FileOutput public class FileInput { public string? Path { get; set; } - public string? MountParentDirectory { get; set; } public string? SourceUrl { get; set; } public TransformationStrategy? 
TransformationStrategy { get; set; } } diff --git a/src/Tes.Runner.Test/Docker/VolumeBindingsGeneratorTests.cs b/src/Tes.Runner.Test/Docker/VolumeBindingsGeneratorTests.cs index 8c1a80b0b..01533f5ef 100644 --- a/src/Tes.Runner.Test/Docker/VolumeBindingsGeneratorTests.cs +++ b/src/Tes.Runner.Test/Docker/VolumeBindingsGeneratorTests.cs @@ -11,7 +11,6 @@ namespace Tes.Runner.Test.Docker [TestClass, TestCategory("Unit")] public class VolumeBindingsGeneratorTests { - private VolumeBindingsGenerator volumeBindingsGenerator = null!; private Mock mockFileInfoProvider = null!; [TestInitialize] @@ -19,7 +18,6 @@ public void SetUp() { mockFileInfoProvider = new Mock(); mockFileInfoProvider.Setup(p => p.GetExpandedFileName(It.IsAny())).Returns(p => p); - volumeBindingsGenerator = new VolumeBindingsGenerator(); } [DataTestMethod] @@ -28,7 +26,8 @@ public void SetUp() [DataRow("/wkd/input/file.bam", "/wkd/", "/wkd/input:/input")] public void GenerateVolumeBindings_SingleInputWithWorkingDir_SingleVolumeBinding(string path, string mountParent, string expected) { - var input = new FileInput() { Path = path, MountParentDirectory = mountParent }; + var volumeBindingsGenerator = new VolumeBindingsGenerator(mountParent); + var input = new FileInput() { Path = path }; var bindings = volumeBindingsGenerator.GenerateVolumeBindings(new List() { input }, outputs: default); @@ -40,7 +39,8 @@ public void GenerateVolumeBindings_SingleInputWithWorkingDir_SingleVolumeBinding [DataRow("/wkd/output/file.bam", "/wkd/", "/wkd/output:/output")] public void GenerateVolumeBindings_SingleOutputWithWorkingDir_SingleVolumeBinding(string path, string mountParent, string expected) { - var output = new FileOutput() { Path = path, MountParentDirectory = mountParent }; + var volumeBindingsGenerator = new VolumeBindingsGenerator(mountParent); + var output = new FileOutput() { Path = path }; var bindings = volumeBindingsGenerator.GenerateVolumeBindings(inputs: default, new List() { output }); @@ -53,7 +53,8 @@ 
public void GenerateVolumeBindings_SingleOutputWithWorkingDir_SingleVolumeBindin [DataRow("/wkd", "/wkd/output:/output", "/wkd/output/file.bam", "/wkd/output/dir1/file1.bam", "/wkd/output/dir2/file1.bam")] public void GenerateVolumeBindings_OutputsWithWorkingDir_SingleVolumeBinding(string mountParent, string expected, params string[] paths) { - var outputs = paths.Select(p => new FileOutput() { Path = p, MountParentDirectory = mountParent }).ToList(); + var volumeBindingsGenerator = new VolumeBindingsGenerator(mountParent); + var outputs = paths.Select(p => new FileOutput() { Path = p }).ToList(); var bindings = volumeBindingsGenerator.GenerateVolumeBindings(inputs: default, outputs); @@ -65,7 +66,8 @@ public void GenerateVolumeBindings_OutputsWithWorkingDir_SingleVolumeBinding(str [DataRow("/wkd", "/wkd/output:/output", "/wkd/out:/out", "/wkd/out/dir1/file1.bam", "/wkd/output/dir2/file1.bam")] public void GenerateVolumeBindings_OutputsWitDifferentParentsAfterWd_TwoVolumeBinding(string mountParent, string expected1, string expected2, params string[] paths) { - var outputs = paths.Select(p => new FileOutput() { Path = p, MountParentDirectory = mountParent }).ToList(); + var volumeBindingsGenerator = new VolumeBindingsGenerator(mountParent); + var outputs = paths.Select(p => new FileOutput() { Path = p }).ToList(); var bindings = volumeBindingsGenerator.GenerateVolumeBindings(inputs: default, outputs); @@ -77,12 +79,12 @@ public void GenerateVolumeBindings_OutputsWitDifferentParentsAfterWd_TwoVolumeBi [TestMethod] public void GenerateVolumeBindings_MultipleInputsAndOutputsWitDifferentParentsAfterWd_TwoVolumeBinding() { - var mountParent = "/wkd"; + var volumeBindingsGenerator = new VolumeBindingsGenerator("/wkd"); var paths = new string[] { "/wkd/outputs/f.bam", "/wkd/outputs/b.bam" }; - var outputs = paths.Select(p => new FileOutput() { Path = p, MountParentDirectory = mountParent }).ToList(); + var outputs = paths.Select(p => new FileOutput() { Path = p 
}).ToList(); paths = new string[] { "/wkd/inputs/f.bam", "/wkd/inputs/b.bam" }; - var inputs = paths.Select(p => new FileInput() { Path = p, MountParentDirectory = mountParent }).ToList(); + var inputs = paths.Select(p => new FileInput() { Path = p }).ToList(); var bindings = volumeBindingsGenerator.GenerateVolumeBindings(inputs, outputs); diff --git a/src/Tes.Runner.Test/Events/EventsPublisherTests.cs b/src/Tes.Runner.Test/Events/EventsPublisherTests.cs index bb932bffb..1b9e8197c 100644 --- a/src/Tes.Runner.Test/Events/EventsPublisherTests.cs +++ b/src/Tes.Runner.Test/Events/EventsPublisherTests.cs @@ -20,9 +20,15 @@ public void SetUp() { Id = "testId", WorkflowId = "workflowID", - ImageName = "image", - ImageTag = "tag", - CommandsToExecute = ["echo hello"], + Executors = + [ + new() + { + ImageName = "image", + ImageTag = "tag", + CommandsToExecute = ["echo hello"], + } + ], Inputs = [ new() @@ -100,28 +106,30 @@ public async Task PublishDownloadEndEventAsync_EventIsPublished_EventContainsAll [TestMethod] public async Task PublishExecutorStartEventAsync_EventIsPublished_EventContainsAllExpectedData() { - await eventsPublisher.PublishExecutorStartEventAsync(nodeTask); + await eventsPublisher.PublishExecutorStartEventAsync(nodeTask, 0); await eventsPublisher.FlushPublishersAsync(); var eventMessage = ((TestEventSink)sinks[0]).EventsHandled[0]; AssertMessageBaseMapping(eventMessage, EventsPublisher.ExecutorStartEvent, EventsPublisher.StartedStatus); - Assert.AreEqual(nodeTask.ImageName, eventMessage.EventData!["image"]); - Assert.AreEqual(nodeTask.ImageTag, eventMessage.EventData!["imageTag"]); - Assert.AreEqual(nodeTask.CommandsToExecute!.First(), eventMessage.EventData!["commands"]); + Assert.AreEqual("0 of 1", eventMessage.EventData!["executor"]); + Assert.AreEqual(nodeTask.Executors?[0].ImageName, eventMessage.EventData!["image"]); + Assert.AreEqual(nodeTask.Executors?[0].ImageTag, eventMessage.EventData!["imageTag"]); + 
Assert.AreEqual(nodeTask.Executors?[0].CommandsToExecute?.First(), eventMessage.EventData!["commands"]); } [TestMethod] public async Task PublishExecutorEndEventAsync_EventIsPublished_EventContainsAllExpectedData() { - await eventsPublisher.PublishExecutorEndEventAsync(nodeTask, exitCode: 0, EventsPublisher.SuccessStatus, errorMessage: string.Empty); + await eventsPublisher.PublishExecutorEndEventAsync(nodeTask, 0, exitCode: 0, statusMessage: EventsPublisher.SuccessStatus, errorMessage: string.Empty); await eventsPublisher.FlushPublishersAsync(); var eventMessage = ((TestEventSink)sinks[0]).EventsHandled[0]; AssertMessageBaseMapping(eventMessage, EventsPublisher.ExecutorEndEvent, EventsPublisher.SuccessStatus); - Assert.AreEqual(nodeTask.ImageName, eventMessage.EventData!["image"]); - Assert.AreEqual(nodeTask.ImageTag, eventMessage.EventData!["imageTag"]); + Assert.AreEqual("0 of 1", eventMessage.EventData!["executor"]); + Assert.AreEqual(nodeTask.Executors?[0].ImageName, eventMessage.EventData!["image"]); + Assert.AreEqual(nodeTask.Executors?[0].ImageTag, eventMessage.EventData!["imageTag"]); Assert.AreEqual(0, int.Parse(eventMessage.EventData!["exitCode"])); Assert.AreEqual("", eventMessage.EventData!["errorMessage"]); } diff --git a/src/Tes.Runner.Test/ExecutorTests.cs b/src/Tes.Runner.Test/ExecutorTests.cs index 44a623515..8d0aad07d 100644 --- a/src/Tes.Runner.Test/ExecutorTests.cs +++ b/src/Tes.Runner.Test/ExecutorTests.cs @@ -42,6 +42,8 @@ public void SetUp() nodeTask = new() { + MountParentDirectory = "/root/parent", + Executors = [new()], Outputs = [ new() @@ -56,6 +58,7 @@ public void SetUp() } ] }; + executor = new Executor(nodeTask, fileOperationResolverMock.Object, eventsPublisherMock.Object, transferOperationFactoryMock.Object); } @@ -153,7 +156,7 @@ public async Task ExecuteNodeContainerTaskAsync_SuccessfulExecution_ReturnsConta dockerExecutorMock.Setup(d => d.RunOnContainerAsync(It.IsAny())) .ReturnsAsync(new ContainerExecutionResult("taskId", Error: 
string.Empty, ExitCode: 0)); - var result = await executor.ExecuteNodeContainerTaskAsync(dockerExecutorMock.Object); + var result = await executor.ExecuteNodeContainerTaskAsync(dockerExecutorMock.Object, 0); Assert.AreEqual(0, result.ContainerResult.ExitCode); Assert.AreEqual(string.Empty, result.ContainerResult.Error); @@ -166,7 +169,7 @@ public async Task ExecuteNodeContainerTaskAsync_ExecutionFails_ReturnsContainerR dockerExecutorMock.Setup(d => d.RunOnContainerAsync(It.IsAny())) .ReturnsAsync(new ContainerExecutionResult("taskId", Error: "Error", ExitCode: 1)); - var result = await executor.ExecuteNodeContainerTaskAsync(dockerExecutorMock.Object); + var result = await executor.ExecuteNodeContainerTaskAsync(dockerExecutorMock.Object, 0); Assert.AreEqual(1, result.ContainerResult.ExitCode); Assert.AreEqual("Error", result.ContainerResult.Error); @@ -178,10 +181,10 @@ public async Task ExecuteNodeContainerTaskAsync_SuccessfulExecution_StartAndSucc dockerExecutorMock.Setup(d => d.RunOnContainerAsync(It.IsAny())) .ReturnsAsync(new ContainerExecutionResult("taskId", Error: string.Empty, ExitCode: 0)); - await executor.ExecuteNodeContainerTaskAsync(dockerExecutorMock.Object); + await executor.ExecuteNodeContainerTaskAsync(dockerExecutorMock.Object, 0); - eventsPublisherMock.Verify(p => p.PublishExecutorStartEventAsync(It.IsAny()), Times.Once); - eventsPublisherMock.Verify(p => p.PublishExecutorEndEventAsync(It.IsAny(), 0, EventsPublisher.SuccessStatus, string.Empty), Times.Once); + eventsPublisherMock.Verify(p => p.PublishExecutorStartEventAsync(It.IsAny(), 0), Times.Once); + eventsPublisherMock.Verify(p => p.PublishExecutorEndEventAsync(It.IsAny(), 0, 0, EventsPublisher.SuccessStatus, string.Empty), Times.Once); } [TestMethod] @@ -190,10 +193,10 @@ public async Task ExecuteNodeContainerTaskAsync_ExecutionFails_StartAndFailureEv dockerExecutorMock.Setup(d => d.RunOnContainerAsync(It.IsAny())) .ReturnsAsync(new ContainerExecutionResult("taskId", Error: "Error", 
ExitCode: 1)); - await executor.ExecuteNodeContainerTaskAsync(dockerExecutorMock.Object); + await executor.ExecuteNodeContainerTaskAsync(dockerExecutorMock.Object, 0); - eventsPublisherMock.Verify(p => p.PublishExecutorStartEventAsync(It.IsAny()), Times.Once); - eventsPublisherMock.Verify(p => p.PublishExecutorEndEventAsync(It.IsAny(), 1, EventsPublisher.FailedStatus, "Error"), Times.Once); + eventsPublisherMock.Verify(p => p.PublishExecutorStartEventAsync(It.IsAny(), 0), Times.Once); + eventsPublisherMock.Verify(p => p.PublishExecutorEndEventAsync(It.IsAny(), 0, 1, EventsPublisher.FailedStatus, "Error"), Times.Once); } [TestMethod] @@ -202,10 +205,10 @@ public async Task ExecuteNodeContainerTaskAsync_ExecutionThrows_StartAndFailureE dockerExecutorMock.Setup(d => d.RunOnContainerAsync(It.IsAny())) .ThrowsAsync(new Exception("Error")); - await Assert.ThrowsExceptionAsync(async () => await executor.ExecuteNodeContainerTaskAsync(dockerExecutorMock.Object)); + await Assert.ThrowsExceptionAsync(async () => await executor.ExecuteNodeContainerTaskAsync(dockerExecutorMock.Object, 0)); - eventsPublisherMock.Verify(p => p.PublishExecutorStartEventAsync(It.IsAny()), Times.Once); - eventsPublisherMock.Verify(p => p.PublishExecutorEndEventAsync(It.IsAny(), Executor.DefaultErrorExitCode, EventsPublisher.FailedStatus, "Error"), Times.Once); + eventsPublisherMock.Verify(p => p.PublishExecutorStartEventAsync(It.IsAny(), 0), Times.Once); + eventsPublisherMock.Verify(p => p.PublishExecutorEndEventAsync(It.IsAny(), 0, Executor.DefaultErrorExitCode, EventsPublisher.FailedStatus, "Error"), Times.Once); } } } diff --git a/src/Tes.Runner/Docker/DockerExecutor.cs b/src/Tes.Runner/Docker/DockerExecutor.cs index e73314bcd..699224a19 100644 --- a/src/Tes.Runner/Docker/DockerExecutor.cs +++ b/src/Tes.Runner/Docker/DockerExecutor.cs @@ -180,7 +180,7 @@ public virtual async Task RunOnContainerAsync(Executio await ConfigureNetworkAsync(); var createResponse = await 
CreateContainerAsync(imageWithTag, executionOptions.CommandsToExecute, executionOptions.VolumeBindings, executionOptions.WorkingDir, - executionOptions.ContainerDeviceRequests); + executionOptions.ContainerEnv, executionOptions.ContainerDeviceRequests); _ = await dockerClient.Containers.InspectContainerAsync(createResponse.ID); var logs = await StartContainerWithStreamingOutput(createResponse); @@ -217,7 +217,8 @@ private async Task StreamStdOutAndErrorAsync(string container } private async Task CreateContainerAsync(string imageWithTag, - List commandsToExecute, List? volumeBindings, string? workingDir, List? deviceRequests = default) + List commandsToExecute, List? volumeBindings, string? workingDir, + IDictionary? env = default, List? deviceRequests = default) { logger.LogInformation(@"Creating container with image name: {ImageWithTag}", imageWithTag); @@ -226,6 +227,7 @@ private async Task CreateContainerAsync(string imageWit { Image = imageWithTag, Cmd = commandsToExecute, + Env = env?.Select(pair => $"{pair.Key}={pair.Value}").ToList(), AttachStdout = true, AttachStderr = true, WorkingDir = workingDir, @@ -314,5 +316,7 @@ private async Task BlockDockerContainerAccessToAzureInstanceMetadataService() } public record ExecutionOptions(string? ImageName, string? Tag, List? CommandsToExecute, - List? VolumeBindings, string? WorkingDir, RuntimeOptions RuntimeOptions, List? ContainerDeviceRequests); + List? VolumeBindings, string? WorkingDir, RuntimeOptions RuntimeOptions, + List? ContainerDeviceRequests, Dictionary? ContainerEnv = default, + string? ContainerStdIn = default, string? ContainerStdOut = default, string? 
ContainerStdErr = default); } diff --git a/src/Tes.Runner/Docker/VolumeBindingsGenerator.cs b/src/Tes.Runner/Docker/VolumeBindingsGenerator.cs index 86a7d5104..455b369dd 100644 --- a/src/Tes.Runner/Docker/VolumeBindingsGenerator.cs +++ b/src/Tes.Runner/Docker/VolumeBindingsGenerator.cs @@ -11,19 +11,22 @@ public class VolumeBindingsGenerator { private readonly ILogger logger = PipelineLoggerFactory.Create(); private readonly IFileInfoProvider fileInfoProvider; + private readonly string mountParentDirectory; - public VolumeBindingsGenerator() : this(new DefaultFileInfoProvider()) + + public VolumeBindingsGenerator(string mountParentDirectory) : this(mountParentDirectory, new DefaultFileInfoProvider()) { } - protected VolumeBindingsGenerator(IFileInfoProvider fileInfoProvider) + protected VolumeBindingsGenerator(string mountParentDirectory, IFileInfoProvider fileInfoProvider) { ArgumentNullException.ThrowIfNull(fileInfoProvider); this.fileInfoProvider = fileInfoProvider; + this.mountParentDirectory = fileInfoProvider.GetExpandedFileName(mountParentDirectory); } - public List GenerateVolumeBindings(List? inputs, List? outputs) + public List GenerateVolumeBindings(List? inputs, List? outputs, List? containerVolumes = default) { var volumeBindings = new HashSet(); @@ -31,7 +34,7 @@ public List GenerateVolumeBindings(List? inputs, List GenerateVolumeBindings(List? inputs, List volumeBindings, string? mountParentDirectory, string path) + private void AddVolumeBindingIfRequired(HashSet volumeBindings, string path) { - var mountPath = ToVolumeBinding(mountParentDirectory, path); + var mountPath = ToVolumeBinding(path); if (!string.IsNullOrEmpty(mountPath)) { @@ -56,30 +65,23 @@ private void AddVolumeBindingIfRequired(HashSet volumeBindings, string? } } - private string? ToVolumeBinding(string? mountParentDirectory, string path) + private string? 
ToVolumeBinding(string path) { - if (string.IsNullOrEmpty(mountParentDirectory)) - { - logger.LogDebug( - $"The file {path} does not have a mount parent directory defined in the task definition. No volume binding will be created for this file in the container."); - return default; - } - - var expandedMountParentDirectory = fileInfoProvider.GetExpandedFileName(mountParentDirectory); var expandedPath = fileInfoProvider.GetExpandedFileName(path); - if (!expandedPath.StartsWith(expandedMountParentDirectory)) + if (!expandedPath.StartsWith(mountParentDirectory)) { - logger.LogWarning( - $"The expanded path value {expandedPath} does not contain the specified mount parent directory: {expandedMountParentDirectory}. No volume binding will be created for this file in the container."); + logger.LogDebug( + "The expanded path value {ExpandedPath} does not contain the specified mount parent directory: {MountParentDirectory}. No volume binding will be created for this file in the container.", + expandedPath, mountParentDirectory); return default; } - var targetDir = $"{expandedPath.Substring(expandedMountParentDirectory.Length).Split('/', StringSplitOptions.RemoveEmptyEntries)[0].TrimStart('/')}"; + var targetDir = $"{expandedPath[mountParentDirectory.Length..].Split('/', StringSplitOptions.RemoveEmptyEntries)[0].TrimStart('/')}"; - var volBinding = $"{expandedMountParentDirectory.TrimEnd('/')}/{targetDir}:/{targetDir}"; + var volBinding = $"{mountParentDirectory.TrimEnd('/')}/{targetDir}:/{targetDir}"; - logger.LogDebug($"Volume binding for {expandedPath} is {volBinding}"); + logger.LogDebug("Volume binding for {ExpandedPath} is {VolBinding}", expandedPath, volBinding); return volBinding; } diff --git a/src/Tes.Runner/Events/EventsPublisher.cs b/src/Tes.Runner/Events/EventsPublisher.cs index 2269f5b16..af7d007c2 100644 --- a/src/Tes.Runner/Events/EventsPublisher.cs +++ b/src/Tes.Runner/Events/EventsPublisher.cs @@ -104,38 +104,44 @@ public virtual async Task 
PublishUploadEndEventAsync(NodeTask nodeTask, int numb { { "numberOfFiles", numberOfFiles.ToString()}, { "totalSizeInBytes", totalSizeInBytes.ToString()}, - { "errorMessage", errorMessage??string.Empty} + { "errorMessage", errorMessage ?? string.Empty} }; await PublishAsync(eventMessage); } - public virtual async Task PublishExecutorStartEventAsync(NodeTask nodeTask) + public virtual async Task PublishExecutorStartEventAsync(NodeTask nodeTask, int selector) { var eventMessage = CreateNewEventMessage(nodeTask.Id, ExecutorStartEvent, StartedStatus, nodeTask.WorkflowId); - var commands = nodeTask.CommandsToExecute ?? []; + var executor = nodeTask.Executors?[selector]; + var commands = executor?.CommandsToExecute ?? []; eventMessage.EventData = new() { - { "image", nodeTask.ImageName??string.Empty}, - { "imageTag", nodeTask.ImageTag??string.Empty}, + { "executor", $"{selector} of {nodeTask.Executors?.Count ?? 0}" }, + { "image", executor?.ImageName ?? string.Empty}, + { "imageTag", executor?.ImageTag ?? string.Empty}, { "commands", string.Join(' ', commands) } }; await PublishAsync(eventMessage); } - public virtual async Task PublishExecutorEndEventAsync(NodeTask nodeTask, long exitCode, string statusMessage, string? errorMessage = default) + public virtual async Task PublishExecutorEndEventAsync(NodeTask nodeTask, int selector, long exitCode, string statusMessage, string? errorMessage = default) { var eventMessage = CreateNewEventMessage(nodeTask.Id, ExecutorEndEvent, statusMessage, nodeTask.WorkflowId); + + var executor = nodeTask.Executors?[selector]; + eventMessage.EventData = new() { - { "image", nodeTask.ImageName??string.Empty}, - { "imageTag", nodeTask.ImageTag??string.Empty}, + { "executor", $"{selector} of {nodeTask.Executors?.Count ?? 0}" }, + { "image", executor?.ImageName ?? string.Empty}, + { "imageTag", executor?.ImageTag ?? 
string.Empty}, { "exitCode", exitCode.ToString()}, - { "errorMessage", errorMessage??string.Empty} + { "errorMessage", errorMessage ?? string.Empty} }; await PublishAsync(eventMessage); } @@ -156,7 +162,7 @@ public virtual async Task PublishDownloadEndEventAsync(NodeTask nodeTask, int nu { { "numberOfFiles", numberOfFiles.ToString()}, { "totalSizeInBytes", totalSizeInBytes.ToString()}, - { "errorMessage", errorMessage??string.Empty} + { "errorMessage", errorMessage ?? string.Empty} }; await PublishAsync(eventMessage); } @@ -176,7 +182,7 @@ public async Task PublishTaskCompletionEventAsync(NodeTask tesNodeTask, TimeSpan eventMessage.EventData = new() { { "duration", duration.ToString()}, - { "errorMessage", errorMessage??string.Empty} + { "errorMessage", errorMessage ?? string.Empty} }; await PublishAsync(eventMessage); diff --git a/src/Tes.Runner/Executor.cs b/src/Tes.Runner/Executor.cs index dc708af8b..2843af8e0 100644 --- a/src/Tes.Runner/Executor.cs +++ b/src/Tes.Runner/Executor.cs @@ -18,7 +18,6 @@ public class Executor : IAsyncDisposable private readonly ILogger logger = PipelineLoggerFactory.Create(); private readonly NodeTask tesNodeTask; private readonly FileOperationResolver operationResolver; - private readonly VolumeBindingsGenerator volumeBindingsGenerator = new(); private readonly EventsPublisher eventsPublisher; private readonly ITransferOperationFactory transferOperationFactory; @@ -46,22 +45,21 @@ public Executor(NodeTask tesNodeTask, FileOperationResolver operationResolver, E this.operationResolver = operationResolver; this.eventsPublisher = eventsPublisher; this.transferOperationFactory = transferOperationFactory; - } - public async Task ExecuteNodeContainerTaskAsync(DockerExecutor dockerExecutor) + public async Task ExecuteNodeContainerTaskAsync(DockerExecutor dockerExecutor, int selector) { try { - await eventsPublisher.PublishExecutorStartEventAsync(tesNodeTask); + await eventsPublisher.PublishExecutorStartEventAsync(tesNodeTask, selector); - 
var bindings = volumeBindingsGenerator.GenerateVolumeBindings(tesNodeTask.Inputs, tesNodeTask.Outputs); + var bindings = new VolumeBindingsGenerator(tesNodeTask.MountParentDirectory!).GenerateVolumeBindings(tesNodeTask.Inputs, tesNodeTask.Outputs, tesNodeTask.ContainerVolumes); - var executionOptions = CreateExecutionOptions(bindings); + var executionOptions = CreateExecutionOptions(tesNodeTask.Executors![selector], bindings); var result = await dockerExecutor.RunOnContainerAsync(executionOptions); - await eventsPublisher.PublishExecutorEndEventAsync(tesNodeTask, result.ExitCode, ToStatusMessage(result), result.Error); + await eventsPublisher.PublishExecutorEndEventAsync(tesNodeTask, selector, result.ExitCode, ToStatusMessage(result), result.Error); return new NodeTaskResult(result); } @@ -69,16 +67,17 @@ public async Task ExecuteNodeContainerTaskAsync(DockerExecutor d { logger.LogError(e, "Failed to execute container"); - await eventsPublisher.PublishExecutorEndEventAsync(tesNodeTask, DefaultErrorExitCode, EventsPublisher.FailedStatus, e.Message); + await eventsPublisher.PublishExecutorEndEventAsync(tesNodeTask, selector, DefaultErrorExitCode, EventsPublisher.FailedStatus, e.Message); throw; } } - private ExecutionOptions CreateExecutionOptions(List bindings) + private ExecutionOptions CreateExecutionOptions(Models.Executor executor, List bindings) { - return new(tesNodeTask.ImageName, tesNodeTask.ImageTag, tesNodeTask.CommandsToExecute, bindings, - tesNodeTask.ContainerWorkDir, tesNodeTask.RuntimeOptions, tesNodeTask.ContainerDeviceRequests); + return new(executor.ImageName, executor.ImageTag, executor.CommandsToExecute, bindings, + executor.ContainerWorkDir, tesNodeTask.RuntimeOptions, tesNodeTask.ContainerDeviceRequests, + executor.ContainerEnv, executor.ContainerStdIn, executor.ContainerStdOut, executor.ContainerStdErr); } private static string ToStatusMessage(ContainerExecutionResult result) diff --git a/src/Tes.RunnerCLI/Commands/CommandFactory.cs 
b/src/Tes.RunnerCLI/Commands/CommandFactory.cs index 4941e7ad0..45e7c0485 100644 --- a/src/Tes.RunnerCLI/Commands/CommandFactory.cs +++ b/src/Tes.RunnerCLI/Commands/CommandFactory.cs @@ -18,6 +18,7 @@ internal static class CommandFactory internal const string DownloadCommandName = "download"; internal const string ExecutorCommandName = "exec"; internal const string DockerUriOption = "docker-url"; + internal const string ExecutorSelectorOption = "executor"; private static readonly IReadOnlyCollection + /// Indicate that a file resource could be accessed using a streaming + /// interface, ie a FUSE mounted s3 object. This flag indicates that + /// using a streaming mount, as opposed to downloading the whole file to + /// the local scratch space, may be faster despite the latency and + /// overhead. This does not mean that the backend will use a streaming + /// interface, as it may not be provided by the vendor, but if the + /// capacity is available it can be used without degrading the + /// performance of the underlying program. 
+ /// [DataMember(Name = "streamable")] public bool Streamable { get; set; } diff --git a/src/TesApi.Tests/Runner/NodeTaskBuilderTests.cs b/src/TesApi.Tests/Runner/NodeTaskBuilderTests.cs index a415002d5..2dbd67286 100644 --- a/src/TesApi.Tests/Runner/NodeTaskBuilderTests.cs +++ b/src/TesApi.Tests/Runner/NodeTaskBuilderTests.cs @@ -37,18 +37,18 @@ public void WithWorkflowId_ValidId_IdIsSet() } [TestMethod] - public void WithContainerWorkingDirectory_WorkingDirIsProvided_WorkingDirIsSet() + public void WithMountParentDirectory_MountParentDirIsProvided_MountParentDirIsSet() { - var workingDir = "/home"; - nodeTaskBuilder.WithContainerWorkingDirectory(workingDir); - Assert.AreEqual(workingDir, nodeTaskBuilder.Build().ContainerWorkDir); + var workingDir = "/root/task"; + nodeTaskBuilder.WithMountParentDirectory(workingDir); + Assert.AreEqual(workingDir, nodeTaskBuilder.Build().MountParentDirectory); } [TestMethod] public void WithInputUsingCombinedTransformationStrategy_WithTerraRuntimeSet_InputUsesTerraCombinedTransformationStrategy() { nodeTaskBuilder.WithTerraAsRuntimeEnvironment("https://wsm.foo", "https://lz.foo", sasAllowedIpRange: String.Empty); - nodeTaskBuilder.WithInputUsingCombinedTransformationStrategy("/root/input", "http://foo.bar/input", "/root"); + nodeTaskBuilder.WithInputUsingCombinedTransformationStrategy("/root/input", "http://foo.bar/input"); var input = nodeTaskBuilder.Build().Inputs![0]; Assert.AreEqual(TransformationStrategy.CombinedTerra, input.TransformationStrategy); @@ -58,7 +58,7 @@ public void WithInputUsingCombinedTransformationStrategy_WithTerraRuntimeSet_Inp public void WithInputUsingCombinedTransformationStrategy_WithTerraRuntimeNotSet_InputUseCombinedARMTransformationStrategy() { - nodeTaskBuilder.WithInputUsingCombinedTransformationStrategy("/root/input", "http://foo.bar/input", "/root"); + nodeTaskBuilder.WithInputUsingCombinedTransformationStrategy("/root/input", "http://foo.bar/input"); var input = 
nodeTaskBuilder.Build().Inputs![0]; Assert.AreEqual(TransformationStrategy.CombinedAzureResourceManager, input.TransformationStrategy); } @@ -66,39 +66,36 @@ public void [TestMethod] public void WithInputUsingCombineTransformationStrategy_ValidInput_AllPropertiesSet() { - nodeTaskBuilder.WithInputUsingCombinedTransformationStrategy("/root/input", "http://foo.bar/input", "/root"); + nodeTaskBuilder.WithInputUsingCombinedTransformationStrategy("/root/input", "http://foo.bar/input"); var input = nodeTaskBuilder.Build().Inputs![0]; Assert.AreEqual("/root/input", input.Path); Assert.AreEqual("http://foo.bar/input", input.SourceUrl); - Assert.AreEqual("/root", input.MountParentDirectory); } [TestMethod] public void WhenInputContainsUriQuery_ValidInput_AllPropertiesSet() { - nodeTaskBuilder.WithInputUsingCombinedTransformationStrategy("/root/input", "http://foo.bar/input?test", "/root"); + nodeTaskBuilder.WithInputUsingCombinedTransformationStrategy("/root/input", "http://foo.bar/input?test"); var input = nodeTaskBuilder.Build().Inputs![0]; Assert.AreEqual("/root/input", input.Path); Assert.AreEqual("http://foo.bar/input?test", input.SourceUrl); - Assert.AreEqual("/root", input.MountParentDirectory); Assert.AreEqual(TransformationStrategy.None, input.TransformationStrategy); } [TestMethod] public void WhenInputPathContainsUriQuery_ValidInput_AllPropertiesSet() { - nodeTaskBuilder.WithInputUsingCombinedTransformationStrategy("/root/input?test", "http://foo.bar/input", "/root"); + nodeTaskBuilder.WithInputUsingCombinedTransformationStrategy("/root/input?test", "http://foo.bar/input"); var input = nodeTaskBuilder.Build().Inputs![0]; Assert.AreEqual("/root/input", input.Path); Assert.AreEqual("http://foo.bar/input", input.SourceUrl); - Assert.AreEqual("/root", input.MountParentDirectory); } [TestMethod] public void WithOutputUsingCombinedTransformationStrategy_WithTerraRuntimeSet_OutputUsesCombinedTerraTransformationStrategy() { 
nodeTaskBuilder.WithTerraAsRuntimeEnvironment("https://wsm.foo", "https://lz.foo", sasAllowedIpRange: String.Empty); - nodeTaskBuilder.WithOutputUsingCombinedTransformationStrategy("/root/output", "http://foo.bar/output", FileType.File, "/root"); + nodeTaskBuilder.WithOutputUsingCombinedTransformationStrategy("/root/output", "http://foo.bar/output", FileType.File); var output = nodeTaskBuilder.Build().Outputs![0]; Assert.AreEqual(TransformationStrategy.CombinedTerra, output.TransformationStrategy); } @@ -106,19 +103,18 @@ public void WithOutputUsingCombinedTransformationStrategy_WithTerraRuntimeSet_Ou [TestMethod] public void WithOutputUsingCombineTransformationStrategy_ValidOutput_AllPropertiesSet() { - nodeTaskBuilder.WithOutputUsingCombinedTransformationStrategy("/root/output", "http://foo.bar/output", FileType.File, "/root"); + nodeTaskBuilder.WithOutputUsingCombinedTransformationStrategy("/root/output", "http://foo.bar/output", FileType.File); var output = nodeTaskBuilder.Build().Outputs![0]; Assert.AreEqual("/root/output", output.Path); Assert.AreEqual("http://foo.bar/output", output.TargetUrl); Assert.AreEqual(FileType.File, output.FileType); - Assert.AreEqual("/root", output.MountParentDirectory); } [TestMethod] public void WithOutputUsingCombinedTransformationStrategy_WithTerraRuntimeNotSet_OutputUsesCombinedARMTransformationStrategy() { - nodeTaskBuilder.WithOutputUsingCombinedTransformationStrategy("/root/output", "http://foo.bar/output", FileType.File, "/root"); + nodeTaskBuilder.WithOutputUsingCombinedTransformationStrategy("/root/output", "http://foo.bar/output", FileType.File); var output = nodeTaskBuilder.Build().Outputs![0]; Assert.AreEqual(TransformationStrategy.CombinedAzureResourceManager, output.TransformationStrategy); } @@ -127,8 +123,9 @@ public void public void WithContainerCommands_CommandsProvided_CommandsAreSet() { var commands = new List() { "echo", "world" }; - nodeTaskBuilder.WithContainerCommands(commands); - var containerInfo = 
nodeTaskBuilder.Build().CommandsToExecute; + Tes.Models.TesExecutor executor = new() { Command = commands, Image = "image" }; + nodeTaskBuilder.WithExecutors([executor]); + var containerInfo = nodeTaskBuilder.Build().Executors[0].CommandsToExecute; CollectionAssert.AreEqual(commands, containerInfo); } @@ -139,10 +136,11 @@ public void WithContainerCommands_CommandsProvided_CommandsAreSet() [DataRow("broadinstitute/gatk@sha256:f80d33060cb4872d29b9a248b193d267f838b1a636c5a6120aaa45b08a1f09e9", "broadinstitute/gatk", "sha256:f80d33060cb4872d29b9a248b193d267f838b1a636c5a6120aaa45b08a1f09e9")] public void WithContainerImageTest_ImageInfoIsProvided_ImageInfoIsSet(string imageInfo, string expectedImage, string expectedTag) { - nodeTaskBuilder.WithContainerImage(imageInfo); + Tes.Models.TesExecutor executor = new() { Image = imageInfo, Command = ["cmd"] }; + nodeTaskBuilder.WithExecutors([executor]); var nodeTask = nodeTaskBuilder.Build(); - Assert.AreEqual(expectedImage, nodeTask.ImageName); - Assert.AreEqual(expectedTag, nodeTask.ImageTag); + Assert.AreEqual(expectedImage, nodeTask.Executors[0].ImageName); + Assert.AreEqual(expectedTag, nodeTask.Executors[0].ImageTag); } [TestMethod] diff --git a/src/TesApi.Tests/Runner/TaskToNodeTaskConverterTests.cs b/src/TesApi.Tests/Runner/TaskToNodeTaskConverterTests.cs index db5baf5e8..4f90c8609 100644 --- a/src/TesApi.Tests/Runner/TaskToNodeTaskConverterTests.cs +++ b/src/TesApi.Tests/Runner/TaskToNodeTaskConverterTests.cs @@ -159,7 +159,6 @@ public async Task Assert.IsNotNull(nodeTask); Assert.IsNull(nodeTask.Inputs); Assert.AreEqual(3, nodeTask.Outputs.Count); - Assert.AreEqual(3, nodeTask.Outputs.Count(output => output.MountParentDirectory is null)); } [TestMethod] @@ -175,10 +174,8 @@ public async Task Assert.IsTrue(input.Path!.StartsWith(TaskToNodeTaskConverter.BatchTaskWorkingDirEnvVar)); } - foreach (var output in nodeTask.Outputs!.Where(output => output.MountParentDirectory is not null)) - { - 
Assert.IsTrue(output.Path!.StartsWith(TaskToNodeTaskConverter.BatchTaskWorkingDirEnvVar)); - } + Assert.AreEqual(5, nodeTask.Outputs?.Count(output => output.Path!.StartsWith(TaskToNodeTaskConverter.BatchTaskWorkingDirEnvVar))); + Assert.AreEqual(3, nodeTask.Outputs?.Count(output => output.Path!.StartsWith($"%{NodeTaskBuilder.BatchTaskDirEnvVarName}%"))); } [TestMethod] @@ -250,7 +247,7 @@ public async Task ToNodeTaskAsync_TesTaskWithNoInputsAndOutputs_NodeTaskContains var nodeTask = await taskToNodeTaskConverter.ToNodeTaskAsync(tesTask, options, CancellationToken.None); Assert.IsNotNull(nodeTask); Assert.IsNull(nodeTask.Inputs); - Assert.IsFalse(nodeTask.Outputs!.Any(output => output.MountParentDirectory is not null)); + Assert.AreEqual(0, nodeTask.Outputs?.Count(output => output.Path!.StartsWith(TaskToNodeTaskConverter.BatchTaskWorkingDirEnvVar))); } [TestMethod] @@ -264,7 +261,7 @@ public async Task Assert.IsNotNull(nodeTask); Assert.AreEqual(2, nodeTask.Inputs!.Count); - Assert.IsFalse(nodeTask.Outputs!.Any(output => output.MountParentDirectory is not null)); + Assert.AreEqual(0, nodeTask.Outputs?.Count(output => output.Path!.StartsWith(TaskToNodeTaskConverter.BatchTaskWorkingDirEnvVar))); } [TestMethod] @@ -277,7 +274,7 @@ public async Task var nodeTask = await taskToNodeTaskConverter.ToNodeTaskAsync(tesTask, options, CancellationToken.None); Assert.IsNotNull(nodeTask); Assert.IsNull(nodeTask.Inputs); - Assert.IsFalse(nodeTask.Outputs!.Any(output => output.MountParentDirectory is not null)); + Assert.AreEqual(0, nodeTask.Outputs?.Count(output => output.Path!.StartsWith(TaskToNodeTaskConverter.BatchTaskWorkingDirEnvVar))); } [TestMethod] @@ -301,7 +298,7 @@ public async Task var nodeTask = await taskToNodeTaskConverter.ToNodeTaskAsync(tesTask, options, CancellationToken.None); Assert.IsNotNull(nodeTask); Assert.IsNull(nodeTask.Inputs); - Assert.IsFalse(nodeTask.Outputs!.Any(output => output.MountParentDirectory is not null)); + Assert.AreEqual(0, 
nodeTask.Outputs?.Count(output => output.Path!.StartsWith(TaskToNodeTaskConverter.BatchTaskWorkingDirEnvVar))); } [TestMethod] diff --git a/src/TesApi.Tests/TaskServiceApiControllerTests.cs b/src/TesApi.Tests/TaskServiceApiControllerTests.cs index 59519fb5b..e29fa5cff 100644 --- a/src/TesApi.Tests/TaskServiceApiControllerTests.cs +++ b/src/TesApi.Tests/TaskServiceApiControllerTests.cs @@ -36,7 +36,7 @@ public async Task TES_Supports_BackendParameter_vmsize() var tesTask = new TesTask { - Executors = [new() { Image = "ubuntu" }], + Executors = [new() { Image = "image", Command = ["cmd"] }], Resources = new() { BackendParameters = backendParameters, BackendParametersStrict = true } }; @@ -66,7 +66,7 @@ public async Task TES_Supports_BackendParameter_workflow_execution_identity() var tesTask = new TesTask { - Executors = [new() { Image = "ubuntu" }], + Executors = [new() { Image = "image", Command = ["cmd"] }], Resources = new() { BackendParameters = backendParameters, BackendParametersStrict = true } }; @@ -96,7 +96,7 @@ public async Task CreateTaskAsync_ReturnsTesCreateTaskResponseWithBackendParamet var tesTask = new TesTask { - Executors = [new() { Image = "ubuntu" }], + Executors = [new() { Image = "image", Command = ["cmd"] }], Resources = new() { BackendParameters = backendParameters } }; @@ -128,7 +128,7 @@ public async Task CreateTaskAsync_ReturnsBadRequest_ForBackendParametersStrict_U var tesTask = new TesTask { - Executors = [new() { Image = "ubuntu" }], + Executors = [new() { Image = "image", Command = ["cmd"] }], Resources = new() { BackendParameters = backendParameters, BackendParametersStrict = true } }; @@ -160,7 +160,7 @@ public async Task CreateTaskAsync_ReturnsBadRequest_ForBackendParametersStrict_D var tesTask = new TesTask { - Executors = [new() { Image = "ubuntu" }], + Executors = [new() { Image = "image", Command = ["cmd"] }], Resources = new() { BackendParameters = backendParameters, BackendParametersStrict = true } }; @@ -177,7 +177,7 @@ public 
async Task CreateTaskAsync_ReturnsBadRequest_ForBackendParametersStrict_D [TestMethod] public async Task CreateTaskAsync_ReturnsBadRequest_ForInvalidId() { - var tesTask = new TesTask { Id = "ClientProvidedId", Executors = [new() { Image = "ubuntu" }] }; + var tesTask = new TesTask { Id = "ClientProvidedId", Executors = [new() { Image = "image", Command = ["cmd"] }] }; using var services = new TestServices.TestServiceProvider(); var controller = services.GetT(); @@ -190,7 +190,7 @@ public async Task CreateTaskAsync_ReturnsBadRequest_ForInvalidId() [TestMethod] public async Task CreateTaskAsync_ReturnsBadRequest_ForMissingDockerImage() { - TesTask tesTask = new() { Executors = [new()] }; + TesTask tesTask = new() { Executors = [new() { Command = ["cmd"] }] }; using var services = new TestServices.TestServiceProvider(); var controller = services.GetT(); @@ -203,7 +203,7 @@ public async Task CreateTaskAsync_ReturnsBadRequest_ForMissingDockerImage() [TestMethod] public async Task CreateTaskAsync_ReturnsBadRequest_ForRelativeInputPath() { - TesTask tesTask = new() { Inputs = [new() { Path = "xyz/path" }] }; + TesTask tesTask = new() { Inputs = [new() { Path = "xyz/path" }], Executors = [new() { Image = "image", Command = ["cmd"] }] }; using var services = new TestServices.TestServiceProvider(); var controller = services.GetT(); @@ -216,7 +216,7 @@ public async Task CreateTaskAsync_ReturnsBadRequest_ForRelativeInputPath() [TestMethod] public async Task CreateTaskAsync_ReturnsBadRequest_ForInputMissingContentAndPath() { - TesTask tesTask = new() { Inputs = [new() { Url = "http://host/path" }] }; + TesTask tesTask = new() { Inputs = [new() { Url = "http://host/path" }], Executors = [new() { Image = "image", Command = ["cmd"] }] }; using var services = new TestServices.TestServiceProvider(); var controller = services.GetT(); @@ -229,7 +229,7 @@ public async Task CreateTaskAsync_ReturnsBadRequest_ForInputMissingContentAndPat [TestMethod] public async Task 
CreateTaskAsync_ReturnsBadRequest_ForInputContentAndPath() { - TesTask tesTask = new() { Inputs = [new() { Url = "http://host/path", Path = "/path/file", Content = "content" }] }; + TesTask tesTask = new() { Inputs = [new() { Url = "http://host/path", Path = "/path/file", Content = "content" }], Executors = [new() { Image = "image", Command = ["cmd"] }] }; using var services = new TestServices.TestServiceProvider(); var controller = services.GetT(); @@ -242,7 +242,7 @@ public async Task CreateTaskAsync_ReturnsBadRequest_ForInputContentAndPath() [TestMethod] public async Task CreateTaskAsync_ReturnsTesCreateTaskResponse() { - var tesTask = new TesTask() { Executors = [new() { Image = "ubuntu" }] }; + var tesTask = new TesTask() { Executors = [new() { Image = "image", Command = ["cmd"] }] }; using var services = new TestServices.TestServiceProvider(); var controller = services.GetT(); @@ -743,7 +743,7 @@ public async Task CreateTaskAsync_ExtractsCromwellWorkflowId() var tesTask = new TesTask() { Description = taskDescription, - Executors = [new() { Image = "ubuntu" }], + Executors = [new() { Image = "image", Command = ["cmd"] }], Inputs = [new() { Type = TesFileType.FILE, Description = "BackendJobDescriptorKey_CommandCallNode_wf_hello.hello.commandScript", Name = "commandScript", Path = $"/cromwell-executions/test/{cromwellWorkflowId}/call-hello/test-subworkflow/{cromwellSubWorkflowId}/call-subworkflow/shard-8/execution/script", Content = "command" }], Outputs = [ @@ -772,7 +772,7 @@ public async Task CreateTaskAsync_CromwellWorkflowIdIsUsedAsTaskIdPrefix() var tesTask = new TesTask() { Description = taskDescription, - Executors = [new() { Image = "ubuntu" }], + Executors = [new() { Image = "image", Command = ["cmd"] }], Inputs = [new() { Type = TesFileType.FILE, Description = "BackendJobDescriptorKey_CommandCallNode_wf_hello.hello.commandScript", Name = "commandScript", Path = 
$"/cromwell-executions/test/{cromwellWorkflowId}/call-hello/test-subworkflow/{cromwellSubWorkflowId}/call-subworkflow/shard-8/execution/script", Content = "command" }], Outputs = [ @@ -801,7 +801,7 @@ public async Task CreateTaskAsync_CromwellMetadataForTriggerServiceIsGenerated(s var tesTask = new TesTask() { Description = taskDescription, - Executors = [new() { Image = "ubuntu" }], + Executors = [new() { Image = "image", Command = ["cmd"] }], Inputs = [new() { Type = TesFileType.FILE, Description = "BackendJobDescriptorKey_CommandCallNode_wf_hello.hello.commandScript", Name = "commandScript", Path = $"{path}/script", Url = $"{path}/script" }], Outputs = [ diff --git a/src/TesApi.Tests/expectedBasicJsonResult.json b/src/TesApi.Tests/expectedBasicJsonResult.json index 50c27a448..83755abbf 100644 --- a/src/TesApi.Tests/expectedBasicJsonResult.json +++ b/src/TesApi.Tests/expectedBasicJsonResult.json @@ -47,7 +47,8 @@ "additionalProp1": "string", "additionalProp2": "string", "additionalProp3": "string" - } + }, + "ignore_error": null } ], "volumes": [ diff --git a/src/TesApi.Tests/expectedFullJsonResult.json b/src/TesApi.Tests/expectedFullJsonResult.json index b77673177..d6cc2786c 100644 --- a/src/TesApi.Tests/expectedFullJsonResult.json +++ b/src/TesApi.Tests/expectedFullJsonResult.json @@ -48,7 +48,8 @@ "additionalProp1": "string", "additionalProp2": "string", "additionalProp3": "string" - } + }, + "ignore_error": null } ], "volumes": [ diff --git a/src/TesApi.Web/Controllers/TaskServiceApi.cs b/src/TesApi.Web/Controllers/TaskServiceApi.cs index 2069fd94c..be8219ccb 100644 --- a/src/TesApi.Web/Controllers/TaskServiceApi.cs +++ b/src/TesApi.Web/Controllers/TaskServiceApi.cs @@ -129,12 +129,40 @@ public virtual async Task CreateTaskAsync([FromBody] TesTask tesT return BadRequest("Id should not be included by the client in the request; the server is responsible for generating a unique Id."); } - if ((tesTask.Executors ?? 
[]).Select(executor => executor.Image).Any(string.IsNullOrWhiteSpace)) + if (!(tesTask.Executors ?? []).Any()) { - return BadRequest("Docker container image name is required."); + return BadRequest("At least one executor is required."); } - foreach (var input in tesTask.Inputs ?? []) + foreach (var executor in (tesTask.Executors ?? [])) + { + if (string.IsNullOrWhiteSpace(executor.Image)) + { + return BadRequest("Docker container image name is required."); + } + + if (!(executor.Command ?? []).Any()) + { + return BadRequest("Executor command is required."); + } + + if (executor.Stdin is not null && !executor.Stdin.StartsWith('/')) + { + return BadRequest("Standard in must be an absolute path in the container."); + } + + if (executor.Stdout is not null && !executor.Stdout.StartsWith('/')) + { + return BadRequest("Standard out must be an absolute path in the container."); + } + + if (executor.Stderr is not null && !executor.Stderr.StartsWith('/')) + { + return BadRequest("Standard error must be an absolute path in the container."); + } + } + + foreach (var input in (tesTask.Inputs ?? [])) { if (string.IsNullOrWhiteSpace(input.Path) || !input.Path.StartsWith('/')) { @@ -162,7 +190,7 @@ public virtual async Task CreateTaskAsync([FromBody] TesTask tesT } } - foreach (var output in tesTask.Outputs ?? Enumerable.Empty()) + foreach (var output in (tesTask.Outputs ?? 
[])) { if (string.IsNullOrWhiteSpace(output.Path) || !output.Path.StartsWith('/')) { diff --git a/src/TesApi.Web/Runner/NodeTaskBuilder.cs b/src/TesApi.Web/Runner/NodeTaskBuilder.cs index fcef04cc5..bf6540538 100644 --- a/src/TesApi.Web/Runner/NodeTaskBuilder.cs +++ b/src/TesApi.Web/Runner/NodeTaskBuilder.cs @@ -3,6 +3,7 @@ using System; using System.Collections.Generic; +using System.Linq; using System.Text.RegularExpressions; using CommonUtilities; using Tes.Runner.Models; @@ -71,15 +72,31 @@ public NodeTaskBuilder WithWorkflowId(string workflowId) } /// - /// Sets the container working directory of the NodeTask + /// Sets container volumes /// - /// + /// /// - public NodeTaskBuilder WithContainerWorkingDirectory(string workingDirectory) + public NodeTaskBuilder WithContainerVolumes(List volumes) { - ArgumentException.ThrowIfNullOrEmpty(workingDirectory, nameof(workingDirectory)); + nodeTask.ContainerVolumes = volumes; + return this; + } + + /// + /// Sets the executors + /// + /// + /// + public NodeTaskBuilder WithExecutors(List executors) + { + ArgumentNullException.ThrowIfNull(executors); - nodeTask.ContainerWorkDir = workingDirectory; + if (executors.Count == 0) + { + throw new ArgumentException("The list executors can't be empty.", nameof(executors)); + } + + nodeTask.Executors = [.. 
executors.Select(ConvertExecutor)]; return this; } @@ -90,9 +107,8 @@ public NodeTaskBuilder WithContainerWorkingDirectory(string workingDirectory) /// /// /// - /// /// - public NodeTaskBuilder WithInputUsingCombinedTransformationStrategy(string path, string sourceUrl, string mountParentDirectory) + public NodeTaskBuilder WithInputUsingCombinedTransformationStrategy(string path, string sourceUrl) { ArgumentException.ThrowIfNullOrEmpty(path, nameof(path)); TransformationStrategy transformationStrategy = GetCombinedTransformationStrategyFromRuntimeOptions(); @@ -115,7 +131,6 @@ public NodeTaskBuilder WithInputUsingCombinedTransformationStrategy(string path, nodeTask.Inputs.Add( new FileInput() { - MountParentDirectory = mountParentDirectory, Path = path, SourceUrl = sourceUrl, TransformationStrategy = transformationStrategy @@ -133,10 +148,9 @@ public NodeTaskBuilder WithInputUsingCombinedTransformationStrategy(string path, /// /// /// - /// /// public NodeTaskBuilder WithOutputUsingCombinedTransformationStrategy(string path, string targetUrl, - FileType? fileType, string mountParentDirectory) + FileType? 
fileType) { ArgumentException.ThrowIfNullOrEmpty(path, nameof(path)); ArgumentException.ThrowIfNullOrEmpty(targetUrl, nameof(targetUrl)); @@ -144,7 +158,6 @@ public NodeTaskBuilder WithOutputUsingCombinedTransformationStrategy(string path nodeTask.Outputs.Add( new FileOutput() { - MountParentDirectory = mountParentDirectory, Path = path, TargetUrl = targetUrl, TransformationStrategy = GetCombinedTransformationStrategyFromRuntimeOptions(), @@ -155,49 +168,57 @@ public NodeTaskBuilder WithOutputUsingCombinedTransformationStrategy(string path } /// - /// Sets the commands to the NodeTask + /// Set the host path for the container root /// - /// + /// /// - /// - public NodeTaskBuilder WithContainerCommands(List commands) + public NodeTaskBuilder WithMountParentDirectory(string mountParentDirectory) { - ArgumentNullException.ThrowIfNull(commands); - - if (commands.Count == 0) - { - throw new InvalidOperationException("The list commands can't be empty"); - } - - nodeTask.CommandsToExecute = commands; - + nodeTask.MountParentDirectory = mountParentDirectory; return this; } /// - /// + /// Parses an Executor from a TesExecutor /// - /// + /// /// - public NodeTaskBuilder WithContainerImage(string image) + internal static Executor ConvertExecutor(Tes.Models.TesExecutor executor) { - ArgumentException.ThrowIfNullOrEmpty(image); + ArgumentNullException.ThrowIfNull(executor); - //check if the image name is a digest - if (image.Contains('@')) + if (executor.Command.Count == 0) { - var splitByDigest = image.Split('@', 2); - nodeTask.ImageName = splitByDigest[0]; - nodeTask.ImageTag = splitByDigest[1]; - return this; + throw new InvalidOperationException("The list commands can't be empty"); } - var splitByTag = image.Split(':', 2); + ArgumentException.ThrowIfNullOrWhiteSpace(executor.Image, nameof(executor)); - nodeTask.ImageName = splitByTag[0]; - nodeTask.ImageTag = splitByTag.Length == 2 ? 
splitByTag[1] : DefaultDockerImageTag; + Executor nodeExecutor = new(); - return this; + //check if the image name is a digest + if (executor.Image.Contains('@')) + { + var splitByDigest = executor.Image.Split('@', 2); + nodeExecutor.ImageName = splitByDigest[0]; + nodeExecutor.ImageTag = splitByDigest[1]; + } + else + { + var splitByTag = executor.Image.Split(':', 2); + nodeExecutor.ImageName = splitByTag[0]; + nodeExecutor.ImageTag = splitByTag.Length == 2 ? splitByTag[1] : DefaultDockerImageTag; + } + + nodeExecutor.CommandsToExecute = executor.Command; + nodeExecutor.ContainerWorkDir = executor.Workdir; + nodeExecutor.ContainerStdIn = executor.Stdin; + nodeExecutor.ContainerStdOut = executor.Stdout; + nodeExecutor.ContainerStdErr = executor.Stderr; + nodeExecutor.ContainerEnv = executor.Env; + nodeExecutor.IgnoreError = executor.IgnoreError ?? false; + + return nodeExecutor; } /// diff --git a/src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs b/src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs index 9b416eeff..8aaa7c962 100644 --- a/src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs +++ b/src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs @@ -129,16 +129,14 @@ public virtual async Task ToNodeTaskAsync(TesTask task, NodeTaskConver { var builder = new NodeTaskBuilder(); - //TODO: Revise this assumption (carried over from the current implementation) and consider Single() if in practice only one executor per task is supported. 
- var executor = task.Executors.First(); - builder.WithId(task.Id) .WithAzureCloudIdentityConfig(azureCloudIdentityConfig) + .WithMountParentDirectory(containerMountParentDirectory) .WithResourceIdManagedIdentity(GetNodeManagedIdentityResourceId(task, nodeTaskConversionOptions.GlobalManagedIdentity)) .WithAcrPullResourceIdManagedIdentity(nodeTaskConversionOptions.AcrPullIdentity) .WithWorkflowId(task.WorkflowId) - .WithContainerCommands(executor.Command) - .WithContainerImage(executor.Image) + .WithContainerVolumes(task.Volumes) + .WithExecutors(task.Executors) .WithStorageEventSink(storageAccessProvider.GetInternalTesBlobUrlWithoutSasToken(blobPath: string.Empty)) .WithLogPublisher(storageAccessProvider.GetInternalTesTaskBlobUrlWithoutSasToken(task, blobPath: string.Empty)) .WithDrsHubUrl(nodeTaskConversionOptions.DrsHubApiHost) @@ -184,8 +182,7 @@ private void AddTaskOutputs(TesTask task, NodeTaskBuilder builder) builder.WithOutputUsingCombinedTransformationStrategy( AppendParentDirectoryIfSet(path, $"%{NodeTaskBuilder.BatchTaskDirEnvVarName}%"), url.AbsoluteUri, - fileType: FileType.File, - mountParentDirectory: null); + fileType: FileType.File); } } @@ -262,7 +259,7 @@ private async Task BuildInputsAsync(TesTask task, NodeTaskBuilder builder, IList inputs.AddRange(distinctAdditionalInputs); } - await MapInputsAsync(inputs, pathParentDirectory, containerMountParentDirectory, builder); + await MapInputsAsync(inputs, pathParentDirectory, builder); } } @@ -535,12 +532,11 @@ private static void MapOutputs(List outputs, string pathParentDirecto outputs?.ForEach(output => { builder.WithOutputUsingCombinedTransformationStrategy( - AppendParentDirectoryIfSet(output.Path, pathParentDirectory), output.Url, ToNodeTaskFileType(output.Type), - containerMountParentDirectory); + AppendParentDirectoryIfSet(output.Path, pathParentDirectory), output.Url, ToNodeTaskFileType(output.Type)); }); } - private async Task MapInputsAsync(List inputs, string pathParentDirectory, string 
containerMountParentDirectory, + private async Task MapInputsAsync(List inputs, string pathParentDirectory, NodeTaskBuilder builder) { if (inputs is null || inputs.Count == 0) @@ -585,8 +581,7 @@ private async Task MapInputsAsync(List inputs, string pathParentDirect void AddInputToBuilder(string path, string url) { builder.WithInputUsingCombinedTransformationStrategy( - AppendParentDirectoryIfSet(path, pathParentDirectory), url, - containerMountParentDirectory); + AppendParentDirectoryIfSet(path, pathParentDirectory), url); } } From 725c0f1e13a0f9d15d5c733b35ac1dd2f1c69fba Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 3 Oct 2024 14:03:48 -0700 Subject: [PATCH 139/202] Add standard i/o to executors --- .../Events/EventsPublisherTests.cs | 4 +- src/Tes.Runner/Docker/DockerExecutor.cs | 106 +++++++++----- src/Tes.Runner/Docker/StreamLogReader.cs | 137 ------------------ src/Tes.Runner/Events/EventsPublisher.cs | 4 +- src/Tes.Runner/Logs/IStreamLogReader.cs | 2 +- src/Tes.Runner/Logs/StreamLogReader.cs | 65 ++++++--- src/Tes.RunnerCLI/Commands/CommandLauncher.cs | 2 +- 7 files changed, 122 insertions(+), 198 deletions(-) delete mode 100644 src/Tes.Runner/Docker/StreamLogReader.cs diff --git a/src/Tes.Runner.Test/Events/EventsPublisherTests.cs b/src/Tes.Runner.Test/Events/EventsPublisherTests.cs index 1b9e8197c..7029619bc 100644 --- a/src/Tes.Runner.Test/Events/EventsPublisherTests.cs +++ b/src/Tes.Runner.Test/Events/EventsPublisherTests.cs @@ -112,7 +112,7 @@ public async Task PublishExecutorStartEventAsync_EventIsPublished_EventContainsA var eventMessage = ((TestEventSink)sinks[0]).EventsHandled[0]; AssertMessageBaseMapping(eventMessage, EventsPublisher.ExecutorStartEvent, EventsPublisher.StartedStatus); - Assert.AreEqual("0 of 1", eventMessage.EventData!["executor"]); + Assert.AreEqual("1/1", eventMessage.EventData!["executor"]); Assert.AreEqual(nodeTask.Executors?[0].ImageName, eventMessage.EventData!["image"]); 
Assert.AreEqual(nodeTask.Executors?[0].ImageTag, eventMessage.EventData!["imageTag"]); Assert.AreEqual(nodeTask.Executors?[0].CommandsToExecute?.First(), eventMessage.EventData!["commands"]); @@ -127,7 +127,7 @@ public async Task PublishExecutorEndEventAsync_EventIsPublished_EventContainsAll var eventMessage = ((TestEventSink)sinks[0]).EventsHandled[0]; AssertMessageBaseMapping(eventMessage, EventsPublisher.ExecutorEndEvent, EventsPublisher.SuccessStatus); - Assert.AreEqual("0 of 1", eventMessage.EventData!["executor"]); + Assert.AreEqual("1/1", eventMessage.EventData!["executor"]); Assert.AreEqual(nodeTask.Executors?[0].ImageName, eventMessage.EventData!["image"]); Assert.AreEqual(nodeTask.Executors?[0].ImageTag, eventMessage.EventData!["imageTag"]); Assert.AreEqual(0, int.Parse(eventMessage.EventData!["exitCode"])); diff --git a/src/Tes.Runner/Docker/DockerExecutor.cs b/src/Tes.Runner/Docker/DockerExecutor.cs index 699224a19..8d1405dcb 100644 --- a/src/Tes.Runner/Docker/DockerExecutor.cs +++ b/src/Tes.Runner/Docker/DockerExecutor.cs @@ -148,62 +148,98 @@ public virtual async Task RunOnContainerAsync(Executio ArgumentException.ThrowIfNullOrEmpty(executionOptions.ImageName); ArgumentNullException.ThrowIfNull(executionOptions.CommandsToExecute); + Stream? stdIn = default; + Stream? stdErr = default; + Stream? 
stdOut = default; + try { + stdIn = GetFileStream(executionOptions.ContainerStdIn, forWrite: false); + stdErr = GetFileStream(executionOptions.ContainerStdErr, forWrite: true); + stdOut = GetFileStream(executionOptions.ContainerStdOut, forWrite: true); + try { - await PullImageWithRetriesAsync(executionOptions.ImageName, executionOptions.Tag); - } - catch (DockerApiException e) when (IsAuthFailure(e)) - { - var authConfig = await containerRegistryAuthorizationManager.TryGetAuthConfigForAzureContainerRegistryAsync(executionOptions.ImageName, executionOptions.Tag, executionOptions.RuntimeOptions); - - if (authConfig is not null) + try { - await PullImageWithRetriesAsync(executionOptions.ImageName, executionOptions.Tag, authConfig); + await PullImageWithRetriesAsync(executionOptions.ImageName, executionOptions.Tag); } - else + catch (DockerApiException e) when (IsAuthFailure(e)) { - throw; + var authConfig = await containerRegistryAuthorizationManager.TryGetAuthConfigForAzureContainerRegistryAsync(executionOptions.ImageName, executionOptions.Tag, executionOptions.RuntimeOptions); + + if (authConfig is not null) + { + await PullImageWithRetriesAsync(executionOptions.ImageName, executionOptions.Tag, authConfig); + } + else + { + throw; + } } } - } - catch - { - _ = await dockerClient.Images.PruneImagesAsync(); - throw; - } + catch + { + _ = await dockerClient.Images.PruneImagesAsync(); + throw; + } + + var imageWithTag = ToImageNameWithTag(executionOptions.ImageName, executionOptions.Tag); + SetLastImage(imageWithTag); - var imageWithTag = ToImageNameWithTag(executionOptions.ImageName, executionOptions.Tag); - SetLastImage(imageWithTag); + await ConfigureNetworkAsync(); - await ConfigureNetworkAsync(); + var createResponse = await CreateContainerAsync(imageWithTag, executionOptions.CommandsToExecute, executionOptions.VolumeBindings, executionOptions.WorkingDir, + stdIn is not null, executionOptions.ContainerEnv, executionOptions.ContainerDeviceRequests); + _ = await 
dockerClient.Containers.InspectContainerAsync(createResponse.ID); - var createResponse = await CreateContainerAsync(imageWithTag, executionOptions.CommandsToExecute, executionOptions.VolumeBindings, executionOptions.WorkingDir, - executionOptions.ContainerEnv, executionOptions.ContainerDeviceRequests); - _ = await dockerClient.Containers.InspectContainerAsync(createResponse.ID); + var logs = await StartContainerWithStreamingOutput(createResponse, stdIn is not null); - var logs = await StartContainerWithStreamingOutput(createResponse); + streamLogReader.StartReadingFromLogStreams(logs, stdIn, stdOut, stdErr); - streamLogReader.StartReadingFromLogStreams(logs); + var runResponse = await dockerClient.Containers.WaitContainerAsync(createResponse.ID); - var runResponse = await dockerClient.Containers.WaitContainerAsync(createResponse.ID); + await streamLogReader.WaitUntilAsync(TimeSpan.FromSeconds(LogStreamingMaxWaitTimeInSeconds)); - await streamLogReader.WaitUntilAsync(TimeSpan.FromSeconds(LogStreamingMaxWaitTimeInSeconds)); + return new ContainerExecutionResult(createResponse.ID, runResponse.Error?.Message, runResponse.StatusCode); + } + finally + { + stdOut?.Dispose(); + stdErr?.Dispose(); + stdIn?.Dispose(); + } + } - return new ContainerExecutionResult(createResponse.ID, runResponse.Error?.Message, runResponse.StatusCode); + private static FileStream? GetFileStream(string? path, bool forWrite) + { + if (string.IsNullOrWhiteSpace(path)) + { + return default; + } + + if (File.Exists(path) || forWrite) + { + return forWrite + ? 
File.OpenWrite(path) + : File.OpenRead(path); + } + else + { + throw new FileNotFoundException(null, path); + } } - private async Task StartContainerWithStreamingOutput(CreateContainerResponse createResponse) + private async Task StartContainerWithStreamingOutput(CreateContainerResponse createResponse, bool streamStdIn) { - var logs = await StreamStdOutAndErrorAsync(createResponse.ID); + var logs = await StreamStdInOutAndErrorAsync(createResponse.ID, streamStdIn); await dockerClient.Containers.StartContainerAsync(createResponse.ID, new ContainerStartParameters()); return logs; } - private async Task StreamStdOutAndErrorAsync(string containerId) + private async Task StreamStdInOutAndErrorAsync(string containerId, bool streamStdIn) { return await dockerClient.Containers.AttachContainerAsync( containerId, @@ -211,6 +247,7 @@ private async Task StreamStdOutAndErrorAsync(string container new ContainerAttachParameters { Stream = true, + Stdin = streamStdIn, Stdout = true, Stderr = true }); @@ -218,7 +255,7 @@ private async Task StreamStdOutAndErrorAsync(string container private async Task CreateContainerAsync(string imageWithTag, List commandsToExecute, List? volumeBindings, string? workingDir, - IDictionary? env = default, List? deviceRequests = default) + bool streamStdIn, IDictionary? env = default, List? 
deviceRequests = default) { logger.LogInformation(@"Creating container with image name: {ImageWithTag}", imageWithTag); @@ -228,8 +265,11 @@ private async Task CreateContainerAsync(string imageWit Image = imageWithTag, Cmd = commandsToExecute, Env = env?.Select(pair => $"{pair.Key}={pair.Value}").ToList(), - AttachStdout = true, - AttachStderr = true, + AttachStdin = false, + AttachStdout = true, // false + AttachStderr = true, // false + OpenStdin = streamStdIn, + StdinOnce = streamStdIn, WorkingDir = workingDir, HostConfig = new() { diff --git a/src/Tes.Runner/Docker/StreamLogReader.cs b/src/Tes.Runner/Docker/StreamLogReader.cs deleted file mode 100644 index b570ad4d1..000000000 --- a/src/Tes.Runner/Docker/StreamLogReader.cs +++ /dev/null @@ -1,137 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. - -using System.Text; -using Docker.DotNet; -using Microsoft.Extensions.Logging; -using Tes.Runner.Logs; -using Tes.Runner.Transfer; - -namespace Tes.Runner.Docker -{ - public abstract class StreamLogReader : IStreamLogReader - { - const int KiB = 1024; - private readonly ILogger logger = PipelineLoggerFactory.Create(); - protected Task? Reader; - - public abstract Task AppendStandardOutputAsync(string data); - public abstract Task AppendStandardErrAsync(string data); - - public abstract void OnComplete(Exception? 
err); - - public void StartReadingFromLogStreams(MultiplexedStream multiplexedStream) - { - if (Reader is not null) - { - throw new InvalidOperationException("Reader is already started"); - } - - Reader = Task.Run(async () => await ReadOutputToEndAsync(multiplexedStream)); - } - - public void StartReadingFromLogStreams(StreamReader stdOut, StreamReader stdErr) - { - if (Reader is not null) - { - throw new InvalidOperationException("Reader is already started"); - } - - var stdOutReader = Task.Run(async () => await ReadOutputToEndAsync(stdOut, StreamSource.StandardOut)); - var stdErrReader = Task.Run(async () => await ReadOutputToEndAsync(stdErr, StreamSource.StandardErr)); - - Reader = Task.WhenAll(stdErrReader, stdOutReader); - } - - public async Task WaitUntilAsync(TimeSpan timeout) - { - try - { - if (Reader is null) - { - throw new InvalidOperationException("Stream reading has not been started"); - } - await Task.WhenAll(Reader).WaitAsync(timeout); - - OnComplete(default); - } - catch (Exception? 
e) - { - OnComplete(e); - throw; - } - } - - private async Task ReadOutputToEndAsync(MultiplexedStream multiplexedStream) - { - try - { - var buffer = new byte[16 * KiB]; //16K at the time - using (multiplexedStream) - { - var result = await multiplexedStream.ReadOutputAsync(buffer, 0, buffer.Length, CancellationToken.None); - - while (!result.EOF) - { - var data = Encoding.UTF8.GetString(buffer, 0, result.Count); - - if (result.Target == MultiplexedStream.TargetStream.StandardOut) - { - await AppendStandardOutputAsync(data); - } - else if (result.Target == MultiplexedStream.TargetStream.StandardError) - { - await AppendStandardErrAsync(data); - } - - result = await multiplexedStream.ReadOutputAsync(buffer, 0, buffer.Length, CancellationToken.None); - } - } - } - - catch (Exception e) - { - logger.LogError(e, "Failed read form the multiplexed stream"); - } - } - - private async Task ReadOutputToEndAsync(StreamReader streamSource, StreamSource source) - { - try - { - var buffer = new Memory(new char[16 * KiB]); //16K at the time - using (streamSource) - { - while (!streamSource.EndOfStream) - { - var result = await streamSource.ReadAsync(buffer, CancellationToken.None); - - switch (source) - { - case StreamSource.StandardOut: - await AppendStandardOutputAsync(result.ToString()); - break; - - case StreamSource.StandardErr: - await AppendStandardErrAsync(result.ToString()); - break; - - default: - throw new ArgumentOutOfRangeException(nameof(source), source, null); - } - } - } - } - catch (Exception e) - { - logger.LogError(e, "Failed read form the multiplexed stream"); - } - } - } - - internal enum StreamSource - { - StandardOut, - StandardErr, - } -} diff --git a/src/Tes.Runner/Events/EventsPublisher.cs b/src/Tes.Runner/Events/EventsPublisher.cs index af7d007c2..8f2108c1d 100644 --- a/src/Tes.Runner/Events/EventsPublisher.cs +++ b/src/Tes.Runner/Events/EventsPublisher.cs @@ -120,7 +120,7 @@ public virtual async Task PublishExecutorStartEventAsync(NodeTask nodeTask, 
int eventMessage.EventData = new() { - { "executor", $"{selector} of {nodeTask.Executors?.Count ?? 0}" }, + { "executor", $"{selector + 1}/{nodeTask.Executors?.Count ?? 0}" }, { "image", executor?.ImageName ?? string.Empty}, { "imageTag", executor?.ImageTag ?? string.Empty}, { "commands", string.Join(' ', commands) } @@ -137,7 +137,7 @@ public virtual async Task PublishExecutorEndEventAsync(NodeTask nodeTask, int se eventMessage.EventData = new() { - { "executor", $"{selector} of {nodeTask.Executors?.Count ?? 0}" }, + { "executor", $"{selector + 1}/{nodeTask.Executors?.Count ?? 0}" }, { "image", executor?.ImageName ?? string.Empty}, { "imageTag", executor?.ImageTag ?? string.Empty}, { "exitCode", exitCode.ToString()}, diff --git a/src/Tes.Runner/Logs/IStreamLogReader.cs b/src/Tes.Runner/Logs/IStreamLogReader.cs index c082a7b56..336705e20 100644 --- a/src/Tes.Runner/Logs/IStreamLogReader.cs +++ b/src/Tes.Runner/Logs/IStreamLogReader.cs @@ -7,7 +7,7 @@ namespace Tes.Runner.Logs; public interface IStreamLogReader { - void StartReadingFromLogStreams(MultiplexedStream multiplexedStream); + void StartReadingFromLogStreams(MultiplexedStream multiplexedStream, Stream? stdIn = default, Stream? stdOut = default, Stream? stdErr = default); void StartReadingFromLogStreams(StreamReader stdOut, StreamReader stdErr); Task WaitUntilAsync(TimeSpan timeout); } diff --git a/src/Tes.Runner/Logs/StreamLogReader.cs b/src/Tes.Runner/Logs/StreamLogReader.cs index c7376a7b9..5c9ec2968 100644 --- a/src/Tes.Runner/Logs/StreamLogReader.cs +++ b/src/Tes.Runner/Logs/StreamLogReader.cs @@ -19,25 +19,28 @@ public abstract class StreamLogReader : IStreamLogReader public abstract void OnComplete(Exception? err); - public void StartReadingFromLogStreams(MultiplexedStream multiplexedStream) + public void StartReadingFromLogStreams(MultiplexedStream multiplexedStream, Stream? stdIn, Stream? stdOut, Stream? 
stdErr) { if (Reader is not null) { - throw new InvalidOperationException("Reader is already started"); + throw new InvalidOperationException("Reader was already started"); } - Reader = Task.Run(async () => await ReadOutputToEndAsync(multiplexedStream)); + var multiplexReader = ReadOutputToEndAsync(multiplexedStream, stdOut, stdErr); + var stdInWriter = stdIn is null ? Task.CompletedTask : WriteInputStream(multiplexedStream, stdIn); + + Reader = Task.WhenAll(multiplexReader, stdInWriter); } public void StartReadingFromLogStreams(StreamReader stdOut, StreamReader stdErr) { if (Reader is not null) { - throw new InvalidOperationException("Reader is already started"); + throw new InvalidOperationException("Reader was already started"); } - var stdOutReader = Task.Run(async () => await ReadOutputToEndAsync(stdOut, StreamSource.StandardOut)); - var stdErrReader = Task.Run(async () => await ReadOutputToEndAsync(stdErr, StreamSource.StandardErr)); + var stdOutReader = ReadOutputToEndAsync(stdOut, StreamSource.StandardOut); + var stdErrReader = ReadOutputToEndAsync(stdErr, StreamSource.StandardErr); Reader = Task.WhenAll(stdErrReader, stdOutReader); } @@ -50,8 +53,8 @@ public async Task WaitUntilAsync(TimeSpan timeout) { throw new InvalidOperationException("Stream reading has not been started"); } - await Task.WhenAll(Reader).WaitAsync(timeout); + await Reader.WaitAsync(timeout); OnComplete(default); } catch (Exception? e) @@ -61,28 +64,46 @@ public async Task WaitUntilAsync(TimeSpan timeout) } } - private async Task ReadOutputToEndAsync(MultiplexedStream multiplexedStream) + private static async Task WriteInputStream(MultiplexedStream multiplexedStream, Stream stream) + { + await multiplexedStream.CopyFromAsync(stream, CancellationToken.None); + multiplexedStream.CloseWrite(); + } + + private async Task ReadOutputToEndAsync(MultiplexedStream multiplexedStream, Stream? stdOut, Stream? 
stdErr) { try { - var buffer = new byte[16 * KiB]; //8K at the time + var buffer = new byte[16 * KiB]; //16K at the time using (multiplexedStream) { - var result = await multiplexedStream.ReadOutputAsync(buffer, 0, buffer.Length, CancellationToken.None); - - while (!result.EOF) + for (var result = await multiplexedStream.ReadOutputAsync(buffer, 0, buffer.Length, CancellationToken.None); + !result.EOF; + result = await multiplexedStream.ReadOutputAsync(buffer, 0, buffer.Length, CancellationToken.None)) { - var data = Encoding.UTF8.GetString(buffer, 0, result.Count); - if (result.Target == MultiplexedStream.TargetStream.StandardOut) - { - await AppendStandardOutputAsync(data); - } - else if (result.Target == MultiplexedStream.TargetStream.StandardError) + var data = buffer.AsMemory(0, result.Count); + var text = Encoding.UTF8.GetString(data.Span); + + switch (result.Target) { - await AppendStandardErrAsync(data); - } + case MultiplexedStream.TargetStream.StandardOut: + await AppendStandardOutputAsync(text); + + if (stdOut is not null) + { + await stdOut.WriteAsync(data); + } + break; - result = await multiplexedStream.ReadOutputAsync(buffer, 0, buffer.Length, CancellationToken.None); + case MultiplexedStream.TargetStream.StandardError: + await AppendStandardErrAsync(text); + + if (stdErr is not null) + { + await stdErr.WriteAsync(data); + } + break; + } } } } @@ -122,7 +143,7 @@ private async Task ReadOutputToEndAsync(StreamReader streamSource, StreamSource } catch (Exception e) { - logger.LogError(e, "Failed read and process stream"); + logger.LogError(e, "Failed to read and process stream"); } } } diff --git a/src/Tes.RunnerCLI/Commands/CommandLauncher.cs b/src/Tes.RunnerCLI/Commands/CommandLauncher.cs index cd312fcef..38996390b 100644 --- a/src/Tes.RunnerCLI/Commands/CommandLauncher.cs +++ b/src/Tes.RunnerCLI/Commands/CommandLauncher.cs @@ -84,7 +84,7 @@ public static async Task LaunchesExecutorCommandAsSubProcessAsync(Runner.Models. 
try { - var processLauncher = await ProcessLauncher.CreateLauncherAsync(nodeTask, logNamePrefix: CommandFactory.ExecutorCommandName, apiVersion: apiVersion); + var processLauncher = await ProcessLauncher.CreateLauncherAsync(nodeTask, logNamePrefix: $"{CommandFactory.ExecutorCommandName}-{selector:D3}", apiVersion: apiVersion); var args = new List() { CommandFactory.ExecutorCommandName, From f860cd45f4e56d0b0704cd77da62f87e2555e62f Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 3 Oct 2024 16:50:25 -0700 Subject: [PATCH 140/202] Convert between container and host filesystems --- .../Docker/DockerExecutorTests.cs | 20 ++++++++++ src/Tes.Runner/Docker/DockerExecutor.cs | 8 ++-- src/Tes.Runner/Host/AzureBatchRunnerHost.cs | 35 ++++++++++++++++ src/Tes.Runner/Host/RunnerHost.cs | 40 +++++++++++++++++++ 4 files changed, 100 insertions(+), 3 deletions(-) diff --git a/src/Tes.Runner.Test/Docker/DockerExecutorTests.cs b/src/Tes.Runner.Test/Docker/DockerExecutorTests.cs index bf0aaf993..a8842560c 100644 --- a/src/Tes.Runner.Test/Docker/DockerExecutorTests.cs +++ b/src/Tes.Runner.Test/Docker/DockerExecutorTests.cs @@ -157,6 +157,26 @@ public override FileInfo GetSharedFile(string name) return file; } + public override string GetTaskWorkingContainerPath(FileInfo file) + { + throw new NotImplementedException(); + } + + public override string GetTaskWorkingContainerPath(DirectoryInfo directory) + { + throw new NotImplementedException(); + } + + public override DirectoryInfo GetTaskWorkingHostDirectory(string path) + { + throw new NotImplementedException(); + } + + public override FileInfo GetTaskWorkingHostFile(string path) + { + throw new NotImplementedException(); + } + public override Task NodeCleanupPreviousTasksAsync() { throw new NotSupportedException(); diff --git a/src/Tes.Runner/Docker/DockerExecutor.cs b/src/Tes.Runner/Docker/DockerExecutor.cs index 8d1405dcb..31132324c 100644 --- a/src/Tes.Runner/Docker/DockerExecutor.cs +++ 
b/src/Tes.Runner/Docker/DockerExecutor.cs @@ -218,11 +218,13 @@ public virtual async Task RunOnContainerAsync(Executio return default; } - if (File.Exists(path) || forWrite) + var file = Executor.RunnerHost.GetTaskWorkingHostFile(path); + + if (file.Exists || forWrite) { return forWrite - ? File.OpenWrite(path) - : File.OpenRead(path); + ? file.OpenWrite() + : file.OpenRead(); } else { diff --git a/src/Tes.Runner/Host/AzureBatchRunnerHost.cs b/src/Tes.Runner/Host/AzureBatchRunnerHost.cs index 194ff8c18..92cccd87a 100644 --- a/src/Tes.Runner/Host/AzureBatchRunnerHost.cs +++ b/src/Tes.Runner/Host/AzureBatchRunnerHost.cs @@ -8,6 +8,41 @@ internal class AzureBatchRunnerHost : RunnerHost private const string NodeRootDir = "AZ_BATCH_NODE_ROOT_DIR"; private const string NodeSharedDir = "AZ_BATCH_NODE_SHARED_DIR"; private const string NodeTaskDir = "AZ_BATCH_TASK_DIR"; + private const string NodeTaskWorkDir = "AZ_BATCH_TASK_WORKING_DIR"; + + public override string GetTaskWorkingContainerPath(FileInfo file) + { + var path = Path.GetRelativePath(Environment.GetEnvironmentVariable(NodeTaskWorkDir) ?? throw new InvalidOperationException("Task working directory not found"), file.FullName); + + if (path == file.FullName) + { + throw new InvalidOperationException("File is not in container."); + } + + return $"/{path.TrimStart('/')}"; + } + + public override string GetTaskWorkingContainerPath(DirectoryInfo directory) + { + var path = Path.GetRelativePath(Environment.GetEnvironmentVariable(NodeTaskWorkDir) ?? throw new InvalidOperationException("Task working directory not found"), directory.FullName); + + if (path == directory.FullName) + { + throw new InvalidOperationException("File is not in container."); + } + + return $"/{path.TrimStart('/')}"; + } + + public override DirectoryInfo GetTaskWorkingHostDirectory(string path) + { + return new(Path.Combine(Environment.GetEnvironmentVariable(NodeTaskWorkDir) ?? 
throw new InvalidOperationException("Task working directory not found"), path.TrimStart('/'))); + } + + public override FileInfo GetTaskWorkingHostFile(string path) + { + return new(Path.Combine(Environment.GetEnvironmentVariable(NodeTaskWorkDir) ?? throw new InvalidOperationException("Task working directory not found"), path.TrimStart('/'))); + } /// public override FileInfo GetSharedFile(string path) diff --git a/src/Tes.Runner/Host/RunnerHost.cs b/src/Tes.Runner/Host/RunnerHost.cs index 469ae6c41..0dd06e626 100644 --- a/src/Tes.Runner/Host/RunnerHost.cs +++ b/src/Tes.Runner/Host/RunnerHost.cs @@ -34,6 +34,34 @@ public interface IRunnerHost /// Task NodeCleanupPreviousTasksAsync(); + /// + /// Gets container path for file in task working directory. + /// + /// + /// + string GetTaskWorkingContainerPath(FileInfo file); + + /// + /// Gets container path for directory in task working directory. + /// + /// + /// + string GetTaskWorkingContainerPath(DirectoryInfo directory); + + /// + /// Gets file on host for path in task container. + /// + /// + /// + FileInfo GetTaskWorkingHostFile(string path); + + /// + /// Gets directory on host for path in task container. 
+ /// + /// + /// + DirectoryInfo GetTaskWorkingHostDirectory(string path); + //void WriteMetric(string key); //void WriteMetric(string key, string value); @@ -47,6 +75,18 @@ internal abstract class RunnerHost : IRunnerHost /// public abstract Task NodeCleanupPreviousTasksAsync(); + /// + public abstract string GetTaskWorkingContainerPath(FileInfo file); + + /// + public abstract string GetTaskWorkingContainerPath(DirectoryInfo directory); + + /// + public abstract DirectoryInfo GetTaskWorkingHostDirectory(string path); + + /// + public abstract FileInfo GetTaskWorkingHostFile(string path); + ///// //public abstract void WriteMetric(string key, string value); From 3e9541cc96741e6acbc7a92bc0662a2f7d95ecb8 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 3 Oct 2024 17:29:16 -0700 Subject: [PATCH 141/202] Fix log gathering --- src/TesApi.Web/BatchScheduler.cs | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 901e03020..ac770f1d1 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -296,23 +296,28 @@ private async Task AddProcessLogsIfAvailable(TesTask tesTask, CancellationToken // https://github.com/microsoft/ga4gh-tes/blob/6e120d33f78c7a36cffe953c74b55cba7cfbf7fc/src/Tes.Runner/Logs/AppendBlobLogPublisher.cs#L39 // Get any logs the task runner left. 
Look for the latest set in this order: upload, exec, download - foreach (var prefix in new[] { "upload_std", "exec_std", "download_std" }) + foreach (var prefix in new[] { "upload_std", "exec-", "download_std" }) { var logs = FilterByPrefix(directoryUri, prefix, await azureProxy.ListBlobsAsync(directoryUri, cancellationToken)); if (logs.Any()) { - if (prefix.StartsWith("exec_")) + if (prefix.Equals("exec-")) { - var log = tesTask.GetOrAddTesTaskLog().GetOrAddExecutorLog(); + var taskLog = tesTask.GetOrAddTesTaskLog(); - foreach (var (type, action) in new (string, Action)[] { ("stderr", list => log.Stderr = list), ("stdout", list => log.Stdout = list) }) + foreach (var selector in logs.Select(blob => blob.BlobNameParts[0]).Distinct()) { - var list = logs.Where(blob => type.Equals(blob.BlobNameParts[1], StringComparison.OrdinalIgnoreCase)).ToList(); + var log = taskLog.GetOrAddExecutorLog(); - if (list.Any()) + foreach (var (type, action) in new (string, Action)[] { ("stderr", list => log.Stderr = list), ("stdout", list => log.Stdout = list) }) { - action(JsonArray(list.Select(blob => blob.BlobUri.AbsoluteUri))); + var list = logs.Where(blob => selector.Equals(blob.BlobNameParts[0]) && type.Equals(blob.BlobNameParts[1], StringComparison.OrdinalIgnoreCase)).ToList(); + + if (list.Any()) + { + action(JsonArray(list.Select(blob => blob.BlobUri.AbsoluteUri))); + } } } } From 56d11489bae6de9fe9da8d59efb8f0e221f8c8b6 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 4 Oct 2024 12:18:30 -0700 Subject: [PATCH 142/202] append instead of truncate --- src/Tes.Runner/Docker/DockerExecutor.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Tes.Runner/Docker/DockerExecutor.cs b/src/Tes.Runner/Docker/DockerExecutor.cs index 31132324c..5de2f4642 100644 --- a/src/Tes.Runner/Docker/DockerExecutor.cs +++ b/src/Tes.Runner/Docker/DockerExecutor.cs @@ -223,7 +223,7 @@ public virtual async Task RunOnContainerAsync(Executio if (file.Exists || forWrite) { return 
forWrite - ? file.OpenWrite() + ? file.Open(FileMode.Append) : file.OpenRead(); } else From edd6d1778fd84b2e71fa511fe2b33611fe7e1415 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Tue, 8 Oct 2024 20:46:44 -0700 Subject: [PATCH 143/202] cleanup merge --- src/CommonUtilities/Models/NodeTask.cs | 7 ++-- .../Docker/VolumeBindingsGeneratorTests.cs | 7 +--- src/Tes.Runner/Executor.cs | 2 +- src/Tes.Runner/Logs/StreamLogReader.cs | 39 +++++++++---------- src/TesApi.Web/Runner/NodeTaskBuilder.cs | 8 ++-- 5 files changed, 29 insertions(+), 34 deletions(-) diff --git a/src/CommonUtilities/Models/NodeTask.cs b/src/CommonUtilities/Models/NodeTask.cs index a154a315b..3d380d8de 100644 --- a/src/CommonUtilities/Models/NodeTask.cs +++ b/src/CommonUtilities/Models/NodeTask.cs @@ -10,7 +10,6 @@ public class NodeTask { public string? Id { get; set; } public string? WorkflowId { get; set; } - public string? MountParentDirectory { get; set; } public List? Executors { get; set; } public List? ContainerDeviceRequests { get; set; } public List? ContainerVolumes { get; set; } @@ -31,9 +30,9 @@ public class Executor public string? ImageName { get; set; } public string? ContainerWorkDir { get; set; } public List? CommandsToExecute { get; set; } - public string? ContainerStdIn { get; set; } - public string? ContainerStdOut { get; set; } - public string? ContainerStdErr { get; set; } + public string? ContainerStdInPath { get; set; } + public string? ContainerStdOutPath { get; set; } + public string? ContainerStdErrPath { get; set; } public Dictionary? 
ContainerEnv { get; set; } public bool IgnoreError { get; set; } } diff --git a/src/Tes.Runner.Test/Docker/VolumeBindingsGeneratorTests.cs b/src/Tes.Runner.Test/Docker/VolumeBindingsGeneratorTests.cs index 004b2e7c3..d6f8b5df8 100644 --- a/src/Tes.Runner.Test/Docker/VolumeBindingsGeneratorTests.cs +++ b/src/Tes.Runner.Test/Docker/VolumeBindingsGeneratorTests.cs @@ -26,7 +26,6 @@ public void SetUp() [DataRow("/wkd/input/file.bam", "/wkd/", "/wkd/input:/input")] public void GenerateVolumeBindings_SingleInputWithWorkingDir_SingleVolumeBinding(string path, string mountParent, string expected) { - var volumeBindingsGenerator = new VolumeBindingsGenerator(mountParent); var input = new FileInput() { Path = path }; var bindings = new VolumeBindingsGenerator(mountParent).GenerateVolumeBindings(new List() { input }, outputs: default); @@ -39,7 +38,6 @@ public void GenerateVolumeBindings_SingleInputWithWorkingDir_SingleVolumeBinding [DataRow("/wkd/output/file.bam", "/wkd/", "/wkd/output:/output")] public void GenerateVolumeBindings_SingleOutputWithWorkingDir_SingleVolumeBinding(string path, string mountParent, string expected) { - var volumeBindingsGenerator = new VolumeBindingsGenerator(mountParent); var output = new FileOutput() { Path = path }; var bindings = new VolumeBindingsGenerator(mountParent).GenerateVolumeBindings(inputs: default, new List() { output }); @@ -53,7 +51,6 @@ public void GenerateVolumeBindings_SingleOutputWithWorkingDir_SingleVolumeBindin [DataRow("/wkd", "/wkd/output:/output", "/wkd/output/file.bam", "/wkd/output/dir1/file1.bam", "/wkd/output/dir2/file1.bam")] public void GenerateVolumeBindings_OutputsWithWorkingDir_SingleVolumeBinding(string mountParent, string expected, params string[] paths) { - var volumeBindingsGenerator = new VolumeBindingsGenerator(mountParent); var outputs = paths.Select(p => new FileOutput() { Path = p }).ToList(); var bindings = new VolumeBindingsGenerator(mountParent).GenerateVolumeBindings(inputs: default, outputs); @@ 
-66,7 +63,6 @@ public void GenerateVolumeBindings_OutputsWithWorkingDir_SingleVolumeBinding(str [DataRow("/wkd", "/wkd/output:/output", "/wkd/out:/out", "/wkd/out/dir1/file1.bam", "/wkd/output/dir2/file1.bam")] public void GenerateVolumeBindings_OutputsWitDifferentParentsAfterWd_TwoVolumeBinding(string mountParent, string expected1, string expected2, params string[] paths) { - var volumeBindingsGenerator = new VolumeBindingsGenerator(mountParent); var outputs = paths.Select(p => new FileOutput() { Path = p }).ToList(); var bindings = new VolumeBindingsGenerator(mountParent).GenerateVolumeBindings(inputs: default, outputs); @@ -79,13 +75,14 @@ public void GenerateVolumeBindings_OutputsWitDifferentParentsAfterWd_TwoVolumeBi [TestMethod] public void GenerateVolumeBindings_MultipleInputsAndOutputsWitDifferentParentsAfterWd_TwoVolumeBinding() { + var mountParent = "/wkd"; var paths = new string[] { "/wkd/outputs/f.bam", "/wkd/outputs/b.bam" }; var outputs = paths.Select(p => new FileOutput() { Path = p }).ToList(); paths = new string[] { "/wkd/inputs/f.bam", "/wkd/inputs/b.bam" }; var inputs = paths.Select(p => new FileInput() { Path = p }).ToList(); - var bindings = new VolumeBindingsGenerator("/wkd").GenerateVolumeBindings(inputs, outputs); + var bindings = new VolumeBindingsGenerator(mountParent).GenerateVolumeBindings(inputs, outputs); Assert.AreEqual(2, bindings.Count); Assert.IsTrue(bindings.Any(p => p.Equals("/wkd/inputs:/inputs", StringComparison.OrdinalIgnoreCase))); diff --git a/src/Tes.Runner/Executor.cs b/src/Tes.Runner/Executor.cs index d8252e69b..ada870e86 100644 --- a/src/Tes.Runner/Executor.cs +++ b/src/Tes.Runner/Executor.cs @@ -80,7 +80,7 @@ private ExecutionOptions CreateExecutionOptions(Models.Executor executor, List public NodeTaskBuilder WithMountParentDirectory(string mountParentDirectory) { - nodeTask.MountParentDirectory = mountParentDirectory; + nodeTask.MountParentDirectoryPath = mountParentDirectory; return this; } @@ -223,9 +223,9 @@ 
internal static Executor ConvertExecutor(Tes.Models.TesExecutor executor) nodeExecutor.CommandsToExecute = executor.Command; nodeExecutor.ContainerWorkDir = executor.Workdir; - nodeExecutor.ContainerStdIn = executor.Stdin; - nodeExecutor.ContainerStdOut = executor.Stdout; - nodeExecutor.ContainerStdErr = executor.Stderr; + nodeExecutor.ContainerStdInPath = executor.Stdin; + nodeExecutor.ContainerStdOutPath = executor.Stdout; + nodeExecutor.ContainerStdErrPath = executor.Stderr; nodeExecutor.ContainerEnv = executor.Env; nodeExecutor.IgnoreError = executor.IgnoreError ?? false; From 64b35c4dcd5dde835f51dfc08eede8b206bafe95 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 9 Oct 2024 11:40:00 -0700 Subject: [PATCH 144/202] fix unit tests --- .../Commands/NodeTaskResolverTests.cs | 6 +- .../ResolutionPolicyHandlerTests.cs | 2 +- .../Storage/FileOperationResolverTests.cs | 2 +- src/TesApi.Tests/BatchPoolTests.cs | 33 ++++----- src/TesApi.Tests/BatchSchedulerTests.cs | 69 +++++-------------- 5 files changed, 35 insertions(+), 77 deletions(-) diff --git a/src/Tes.Runner.Test/Commands/NodeTaskResolverTests.cs b/src/Tes.Runner.Test/Commands/NodeTaskResolverTests.cs index d19f65c78..704e5e662 100644 --- a/src/Tes.Runner.Test/Commands/NodeTaskResolverTests.cs +++ b/src/Tes.Runner.Test/Commands/NodeTaskResolverTests.cs @@ -133,7 +133,7 @@ public async Task ResolveNodeTaskAsyncWithUriWhenFileExistsDoesNotDownload() public async Task ResolveNodeTaskAsyncWithUriWhenFileNotExistsDoesDownload() { ConfigureBlobApiHttpUtils((_, _) => Task.FromResult(new HttpResponseMessage(HttpStatusCode.OK) { Content = new StringContent(@"{}") })); - SetEnvironment(new() { RuntimeOptions = new() }); + SetEnvironment(new() { RuntimeOptions = new() { MountParentDirectoryPath = Environment.CurrentDirectory + "/task" } }); taskFile = new(Path.Combine(Environment.CurrentDirectory, CommandFactory.DefaultTaskDefinitionFile)); Assert.IsFalse(taskFile.Exists); @@ -147,7 +147,7 @@ public async Task 
ResolveNodeTaskAsyncWithUriWhenFileNotExistsDoesDownload() public async Task ResolveNodeTaskAsyncWithUriWhenFileNotExistsDoesSave() { ConfigureBlobApiHttpUtils((_, _) => Task.FromResult(new HttpResponseMessage(HttpStatusCode.OK) { Content = new StringContent(@"{}") })); - SetEnvironment(new() { RuntimeOptions = new() }); + SetEnvironment(new() { RuntimeOptions = new() { MountParentDirectoryPath = Environment.CurrentDirectory + "/task" } }); taskFile = new(Path.Combine(Environment.CurrentDirectory, CommandFactory.DefaultTaskDefinitionFile)); Assert.IsFalse(taskFile.Exists); @@ -165,7 +165,7 @@ public async Task ResolveNodeTaskAsyncUsesResolutionPolicyResolver() var sendGetCalled = false; ConfigureBlobApiHttpUtils((request, _) => Task.FromResult(Send(request)), (options, apiVersion) => new MockableResolutionPolicyHandler(ApplySasResolutionToUrl, options, apiVersion)); - SetEnvironment(new() { RuntimeOptions = new() { Terra = new() }, TransformationStrategy = TransformationStrategy.CombinedTerra }); + SetEnvironment(new() { RuntimeOptions = new() { Terra = new(), MountParentDirectoryPath = Environment.CurrentDirectory + "/task" }, TransformationStrategy = TransformationStrategy.CombinedTerra }); taskFile = new(Path.Combine(Environment.CurrentDirectory, CommandFactory.DefaultTaskDefinitionFile)); var result = await nodeTaskResolver.ResolveNodeTaskAsync(file: taskFile, uri: new("http://localhost/task.json"), apiVersion: BlobPipelineOptions.DefaultApiVersion, saveDownload: false); diff --git a/src/Tes.Runner.Test/ResolutionPolicyHandlerTests.cs b/src/Tes.Runner.Test/ResolutionPolicyHandlerTests.cs index 14844609d..62c033bea 100644 --- a/src/Tes.Runner.Test/ResolutionPolicyHandlerTests.cs +++ b/src/Tes.Runner.Test/ResolutionPolicyHandlerTests.cs @@ -16,7 +16,7 @@ public class ResolutionPolicyHandlerTests [TestInitialize] public void SetUp() { - runtimeOptions = new RuntimeOptions(); + runtimeOptions = new RuntimeOptions() { MountParentDirectoryPath = "/task" }; 
resolutionPolicyHandler = new ResolutionPolicyHandler(runtimeOptions, Runner.Transfer.BlobPipelineOptions.DefaultApiVersion); } diff --git a/src/Tes.Runner.Test/Storage/FileOperationResolverTests.cs b/src/Tes.Runner.Test/Storage/FileOperationResolverTests.cs index ec8cdf06c..8dc84878b 100644 --- a/src/Tes.Runner.Test/Storage/FileOperationResolverTests.cs +++ b/src/Tes.Runner.Test/Storage/FileOperationResolverTests.cs @@ -22,7 +22,7 @@ public class FileOperationResolverTests [TestInitialize] public void SetUp() { - resolutionPolicyHandler = new(new(), BlobPipelineOptions.DefaultApiVersion); + resolutionPolicyHandler = new(new() { MountParentDirectoryPath = Environment.CurrentDirectory }, BlobPipelineOptions.DefaultApiVersion); singleFileInput = new() { diff --git a/src/TesApi.Tests/BatchPoolTests.cs b/src/TesApi.Tests/BatchPoolTests.cs index afbd0b780..2005fb96c 100644 --- a/src/TesApi.Tests/BatchPoolTests.cs +++ b/src/TesApi.Tests/BatchPoolTests.cs @@ -423,7 +423,7 @@ internal static AutoScaleRun GenerateAutoScaleRun(Microsoft.Azure.Batch.Protocol { var protocolObject = new Microsoft.Azure.Batch.Protocol.Models.AutoScaleRun(DateTime.UtcNow, results, error); var autoScaleRun = (AutoScaleRun)typeof(AutoScaleRun).GetConstructor(System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance, default, [typeof(Microsoft.Azure.Batch.Protocol.Models.AutoScaleRun)], default) - .Invoke(new object[] { protocolObject }); + .Invoke([protocolObject]); return autoScaleRun; } @@ -451,12 +451,7 @@ internal static CloudPool GeneratePool( metadata ??= []; - Mock computeNodeOperations = new(); - MockServiceClient batchServiceClient = new(computeNodeOperations.Object); - var protocolLayer = typeof(Microsoft.Azure.Batch.Protocol.BatchServiceClient).Assembly.GetType("ProtocolLayer").GetConstructor(System.Reflection.BindingFlags.Instance | System.Reflection.BindingFlags.NonPublic, null, [typeof(Microsoft.Azure.Batch.Protocol.BatchServiceClient)], null) - 
.Invoke([batchServiceClient]); - var parentClient = (BatchClient)typeof(BatchClient).GetConstructor(System.Reflection.BindingFlags.Instance | System.Reflection.BindingFlags.NonPublic, null, [typeof(Microsoft.Azure.Batch.Protocol.BatchServiceClient).Assembly.GetType("IProtocolLayer")], null) - .Invoke([protocolLayer]); + var parentClient = CreateMockBatchClient(); Microsoft.Azure.Batch.Protocol.Models.CloudPool modelPool = new( id: id, currentDedicatedNodes: currentDedicatedNodes, @@ -489,12 +484,7 @@ internal static CloudTask GenerateTask(string jobId, string id, DateTime stateTr stateTransitionTime = DateTime.UtcNow; } - Mock computeNodeOperations = new(); - MockServiceClient batchServiceClient = new(computeNodeOperations.Object); - var protocolLayer = typeof(Microsoft.Azure.Batch.Protocol.BatchServiceClient).Assembly.GetType("ProtocolLayer").GetConstructor(System.Reflection.BindingFlags.Instance | System.Reflection.BindingFlags.NonPublic, null, [typeof(Microsoft.Azure.Batch.Protocol.BatchServiceClient)], null) - .Invoke([batchServiceClient]); - var parentClient = (BatchClient)typeof(BatchClient).GetConstructor(System.Reflection.BindingFlags.Instance | System.Reflection.BindingFlags.NonPublic, null, [typeof(Microsoft.Azure.Batch.Protocol.BatchServiceClient).Assembly.GetType("IProtocolLayer")], null) - .Invoke([protocolLayer]); + var parentClient = CreateMockBatchClient(); Microsoft.Azure.Batch.Protocol.Models.CloudTask modelTask = new(id: id, stateTransitionTime: stateTransitionTime, state: Microsoft.Azure.Batch.Protocol.Models.TaskState.Active); var task = (CloudTask)typeof(CloudTask).GetConstructor(System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance, default, [typeof(BatchClient), typeof(string), typeof(Microsoft.Azure.Batch.Protocol.Models.CloudTask), typeof(IEnumerable)], default) .Invoke([parentClient, jobId, modelTask, Enumerable.Empty()]); @@ -508,16 +498,21 @@ internal static ComputeNode GenerateNode(string poolId, string id, 
bool isDedica stateTransitionTime = DateTime.UtcNow; } - Mock computeNodeOperations = new(); - MockServiceClient batchServiceClient = new(computeNodeOperations.Object); - var protocolLayer = typeof(Microsoft.Azure.Batch.Protocol.BatchServiceClient).Assembly.GetType("ProtocolLayer").GetConstructor(System.Reflection.BindingFlags.Instance | System.Reflection.BindingFlags.NonPublic, null, [typeof(Microsoft.Azure.Batch.Protocol.BatchServiceClient)], null) - .Invoke([batchServiceClient]); - var parentClient = (BatchClient)typeof(BatchClient).GetConstructor(System.Reflection.BindingFlags.Instance | System.Reflection.BindingFlags.NonPublic, null, [typeof(Microsoft.Azure.Batch.Protocol.BatchServiceClient).Assembly.GetType("IProtocolLayer")], null) - .Invoke([protocolLayer]); + var parentClient = CreateMockBatchClient(); Microsoft.Azure.Batch.Protocol.Models.ComputeNode modelNode = new(stateTransitionTime: stateTransitionTime, id: id, affinityId: AffinityPrefix + id, isDedicated: isDedicated, state: isIdle ? 
Microsoft.Azure.Batch.Protocol.Models.ComputeNodeState.Idle : Microsoft.Azure.Batch.Protocol.Models.ComputeNodeState.Running); var node = (ComputeNode)typeof(ComputeNode).GetConstructor(System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance, default, [typeof(BatchClient), typeof(string), typeof(Microsoft.Azure.Batch.Protocol.Models.ComputeNode), typeof(IEnumerable)], default) .Invoke([parentClient, poolId, modelNode, null]); return node; } + + private static BatchClient CreateMockBatchClient() + { + Mock computeNodeOperations = new(); + MockServiceClient batchServiceClient = new(computeNodeOperations.Object); + var protocolLayer = typeof(Microsoft.Azure.Batch.Protocol.BatchServiceClient).Assembly.GetType("Microsoft.Azure.Batch.ProtocolLayer").GetConstructor(System.Reflection.BindingFlags.Instance | System.Reflection.BindingFlags.NonPublic, null, [typeof(Microsoft.Azure.Batch.Protocol.BatchServiceClient)], null) + .Invoke([batchServiceClient]); + return (BatchClient)typeof(BatchClient).GetConstructor(System.Reflection.BindingFlags.Instance | System.Reflection.BindingFlags.NonPublic, null, [typeof(Microsoft.Azure.Batch.Protocol.BatchServiceClient).Assembly.GetType("Microsoft.Azure.Batch.IProtocolLayer")], null) + .Invoke([protocolLayer]); + } } } diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index d40626b19..bf799b184 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -12,7 +12,6 @@ using Azure.ResourceManager.Batch; using Azure.ResourceManager.Batch.Models; using Azure.Storage.Blobs; -using Azure.Storage.Blobs.Models; using Microsoft.Azure.Batch; using Microsoft.Azure.Batch.Common; using Microsoft.Extensions.DependencyInjection; @@ -745,7 +744,7 @@ public async Task BatchJobContainsExpectedBatchPoolInformation() GuardAssertsWithTesTask(tesTask, () => { - Assert.AreEqual("TES-hostname-edicated1-replaceme-", tesTask.PoolId[0..^8]); + 
Assert.AreEqual("TES-hostname-edicated1-woy4muc7mxr23jl23zwg4kn2ynz4kvzo-", tesTask.PoolId[0..^8]); Assert.AreEqual("VmSizeDedicated1", pool.VmSize); Assert.IsTrue(((BatchScheduler)batchScheduler).TryGetPool(tesTask.PoolId, out _)); }); @@ -1093,10 +1092,13 @@ public async Task SuccessfullyCompletedTaskContainsBatchNodeMetrics() DiskUsedInKiB=1000000".Replace(" ", string.Empty); var azureProxyReturnValues = AzureProxyReturnValues.Defaults; - azureProxyReturnValues.BatchTaskState = BatchTaskStates.TaskCompletedSuccessfully[0]; azureProxyReturnValues.DownloadedBlobContent = metricsFileContent; - _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(azureProxyReturnValues), GetMockBatchPoolManager(azureProxyReturnValues), azureProxyReturnValues); + foreach (var batchTaskState in BatchTaskStates.TaskCompletedSuccessfully) + { + azureProxyReturnValues.BatchTaskState = batchTaskState; + _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(azureProxyReturnValues), GetMockBatchPoolManager(azureProxyReturnValues), azureProxyReturnValues); + } GuardAssertsWithTesTask(tesTask, () => { @@ -1128,12 +1130,15 @@ public async Task SuccessfullyCompletedTaskContainsCromwellResultCode() tesTask.State = TesState.INITIALIZING; var azureProxyReturnValues = AzureProxyReturnValues.Defaults; - azureProxyReturnValues.BatchTaskState = BatchTaskStates.TaskCompletedSuccessfully[0]; azureProxyReturnValues.DownloadedBlobContent = "2"; var azureProxy = GetMockAzureProxy(azureProxyReturnValues); var batchPoolManager = GetMockBatchPoolManager(azureProxyReturnValues); - _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxy, batchPoolManager, azureProxyReturnValues); + foreach (var batchTaskState in BatchTaskStates.TaskCompletedSuccessfully) + { + azureProxyReturnValues.BatchTaskState = batchTaskState; + _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, 
GetMockConfig()(), azureProxy, batchPoolManager, azureProxyReturnValues); + } GuardAssertsWithTesTask(tesTask, () => { @@ -1143,52 +1148,6 @@ public async Task SuccessfullyCompletedTaskContainsCromwellResultCode() }); } - [DataTestMethod] - [DataRow(["task-executor-1_stdout_20241007203438616.txt", "task-executor-1_stderr_20241007203438616.txt"])] - [DataRow(["task-executor-1_stdout_20241007203438616.txt", "task-executor-1_stderr_20241007203438616.txt", "task-executor-1_stdout_20241007203438616_1.txt",])] - public async Task ExecutorLogsAreAddedToExecutorLog(IEnumerable logs) - { - List expectedStdErrLogs = []; - List expectedStdOutLogs = []; - var tesTask = GetTesTask(); - tesTask.State = TesState.RUNNING; - - var azureProxyReturnValues = AzureProxyReturnValues.Defaults; - azureProxyReturnValues.BatchTaskState = BatchTaskStates.TaskCompletedSuccessfully[0]; - var azureProxy = GetMockAzureProxy(azureProxyReturnValues); - var batchPoolManager = GetMockBatchPoolManager(azureProxyReturnValues); - - _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), GetMockAzureProxy(azureProxyReturnValues), batchPoolManager, azureProxyReturnValues, serviceProviderActions: serviceProvider => - { - var storageAccessProvider = serviceProvider.GetServiceOrCreateInstance(); - var executionDirectoryUri = storageAccessProvider.GetInternalTesTaskBlobUrlAsync(tesTask, null, Azure.Storage.Sas.BlobSasPermissions.Read, CancellationToken.None).GetAwaiter().GetResult(); - - logs.Order().ForEach(log => - { - var uri = storageAccessProvider.GetInternalTesTaskBlobUrlWithoutSasToken(tesTask, log); - - if (log.Contains("stderr")) - { - expectedStdErrLogs.Add(uri); - } - else if (log.Contains("stdout")) - { - expectedStdOutLogs.Add(uri); - } - }); - - serviceProvider.AzureProxy.Setup(p => p.ListBlobsAsync(It.Is(executionDirectoryUri, new UrlMutableSASEqualityComparer()), It.IsAny())).Returns(expectedStdErrLogs.Concat(expectedStdOutLogs).OrderBy(uri => 
uri.AbsoluteUri).Select(log => new BlobNameAndUri(new BlobUriBuilder(log).BlobName, log)).ToAsyncEnumerable()); - }); - - GuardAssertsWithTesTask(tesTask, () => - { - Assert.IsTrue(expectedStdOutLogs.SequenceEqual(GetLogs(tesTask.Logs.LastOrDefault()?.Logs.FirstOrDefault()?.Stdout ?? string.Empty) ?? [])); - Assert.IsTrue(expectedStdErrLogs.SequenceEqual(GetLogs(tesTask.Logs.LastOrDefault()?.Logs.FirstOrDefault()?.Stderr ?? string.Empty) ?? [])); - }); - - static IEnumerable GetLogs(string logs) => logs is null ? null : System.Text.Json.JsonSerializer.Deserialize>(logs).Select(log => new Uri(log)); - } - [DataTestMethod] [DataRow(new string[] { null, "echo hello" }, "blob1.tmp", false, DisplayName = "commandScript via content")] [DataRow(new string[] { "https://defaultstorageaccount.blob.core.windows.net/cromwell-executions/workflow1/0fbdb535-4afd-45e3-a8a8-c8e50585ee4e/call-Task1/execution/script", null }, "blob1.tmp", false, DisplayName = "default url with file missing")] @@ -1717,7 +1676,11 @@ private struct BatchTaskStates public static AzureBatchTaskState[] TaskActive => [new AzureBatchTaskState(AzureBatchTaskState.TaskState.InfoUpdate)]; public static AzureBatchTaskState[] TaskPreparing => [new AzureBatchTaskState(AzureBatchTaskState.TaskState.Initializing, CloudTaskCreationTime: DateTimeOffset.UtcNow)]; public static AzureBatchTaskState[] TaskRunning => [new AzureBatchTaskState(AzureBatchTaskState.TaskState.Running, CloudTaskCreationTime: DateTimeOffset.UtcNow - TimeSpan.FromMinutes(6))]; - public static AzureBatchTaskState[] TaskCompletedSuccessfully => [new AzureBatchTaskState(AzureBatchTaskState.TaskState.CompletedSuccessfully, BatchTaskExitCode: 0)]; + public static AzureBatchTaskState[] TaskCompletedSuccessfully => + [ + new AzureBatchTaskState(AzureBatchTaskState.TaskState.InfoUpdate, ExecutorExitCode: 0), + new AzureBatchTaskState(AzureBatchTaskState.TaskState.CompletedSuccessfully, BatchTaskExitCode: 0), + ]; public static AzureBatchTaskState[] 
TaskFailed => [ new AzureBatchTaskState(AzureBatchTaskState.TaskState.InfoUpdate, Failure: new(AzureBatchTaskState.ExecutorError, [TaskFailureInformationCodes.FailureExitCode, @"1"]), ExecutorExitCode: 1), From 7b161ba227234ce8b113fc84180b6e616c1b9484 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 9 Oct 2024 14:20:30 -0700 Subject: [PATCH 145/202] fix batch task environment --- .../Runner/TaskToNodeTaskConverter.cs | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs b/src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs index f4f0e9e27..dd3b6a838 100644 --- a/src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs +++ b/src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs @@ -88,9 +88,6 @@ public virtual NodeTaskResolverOptions ToNodeTaskResolverOptions(TesTask task, N try { var builder = new NodeTaskBuilder(); - builder.WithAzureCloudIdentityConfig(azureCloudIdentityConfig) - .WithStorageEventSink(storageAccessProvider.GetInternalTesBlobUrlWithoutSasToken(blobPath: string.Empty)) - .WithResourceIdManagedIdentity(GetNodeManagedIdentityResourceId(nodeTaskConversionOptions.GlobalManagedIdentity, task)); if (terraOptions is not null && !string.IsNullOrEmpty(terraOptions.WsmApiHost)) { @@ -99,6 +96,12 @@ public virtual NodeTaskResolverOptions ToNodeTaskResolverOptions(TesTask task, N terraOptions.SasAllowedIpRange); } + builder.WithAzureCloudIdentityConfig(azureCloudIdentityConfig) + .WithContainerMountParentDirectory(containerMountParentDirectory) + .WithStorageEventSink(storageAccessProvider.GetInternalTesTaskBlobUrlWithoutSasToken(task, blobPath: string.Empty)) + .WithLogPublisher(storageAccessProvider.GetInternalTesTaskBlobUrlWithoutSasToken(task, blobPath: string.Empty)) + .WithResourceIdManagedIdentity(GetNodeManagedIdentityResourceId(nodeTaskConversionOptions.GlobalManagedIdentity, task)); + var runtimeOptions = builder.Build().RuntimeOptions; runtimeOptions.StorageEventSink.TargetUrl 
= default; @@ -129,6 +132,13 @@ public virtual async Task ToNodeTaskAsync(TesTask task, NodeTaskConver { var builder = new NodeTaskBuilder(); + if (terraOptions is not null && !string.IsNullOrEmpty(terraOptions.WsmApiHost)) + { + logger.LogInformation("Setting up Terra as the runtime environment for the runner"); + builder.WithTerraAsRuntimeEnvironment(terraOptions.WsmApiHost, terraOptions.LandingZoneApiHost, + terraOptions.SasAllowedIpRange); + } + //TODO: Revise this assumption (carried over from the current implementation) and consider Single() if in practice only one executor per task is supported. var executor = task.Executors.First(); @@ -156,13 +166,6 @@ public virtual async Task ToNodeTaskAsync(TesTask task, NodeTaskConver break; } - if (terraOptions is not null && !string.IsNullOrEmpty(terraOptions.WsmApiHost)) - { - logger.LogInformation("Setting up Terra as the runtime environment for the runner"); - builder.WithTerraAsRuntimeEnvironment(terraOptions.WsmApiHost, terraOptions.LandingZoneApiHost, - terraOptions.SasAllowedIpRange); - } - await BuildInputsAsync(task, builder, nodeTaskConversionOptions.AdditionalInputs, nodeTaskConversionOptions.DefaultStorageAccountName, cancellationToken); BuildOutputs(task, nodeTaskConversionOptions.DefaultStorageAccountName, builder); From 98baf8a674d54c7589c6f56c243d07441e81e230 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Sat, 12 Oct 2024 00:36:27 -0700 Subject: [PATCH 146/202] minor refactoring of task initialization and pool verification and creation --- src/Tes/Extensions/TesTaskExtensions.cs | 19 +- .../Repository/PostgreSqlCachingRepository.cs | 5 +- src/TesApi.Tests/BatchSchedulerTests.cs | 10 +- src/TesApi.Web/BatchPools.BatchScheduler.cs | 43 +-- src/TesApi.Web/BatchScheduler.cs | 263 ++++++++++++------ src/TesApi.Web/IBatchScheduler.cs | 11 +- .../OrchestrateOnBatchSchedulerServiceBase.cs | 10 +- src/TesApi.Web/PoolScheduler.cs | 11 +- src/TesApi.Web/TaskScheduler.cs | 46 ++- 9 files changed, 283 
insertions(+), 135 deletions(-) diff --git a/src/Tes/Extensions/TesTaskExtensions.cs b/src/Tes/Extensions/TesTaskExtensions.cs index 6ff3a5010..4e5a7d009 100644 --- a/src/Tes/Extensions/TesTaskExtensions.cs +++ b/src/Tes/Extensions/TesTaskExtensions.cs @@ -115,9 +115,11 @@ public static void SetWarning(this TesTask tesTask, string warning, params strin /// Last public static TesTaskLog GetOrAddTesTaskLog(this TesTask tesTask) { - if (tesTask.Logs is null || !tesTask.Logs.Any()) + if ((tesTask.Logs?.Count ?? 0) == 0) { - tesTask.Logs = [new()]; + TesTaskLog log = new(); + tesTask.Logs = [log]; + return log; } return tesTask.Logs.Last(); @@ -130,10 +132,11 @@ public static TesTaskLog GetOrAddTesTaskLog(this TesTask tesTask) /// Last public static TesTaskLog AddTesTaskLog(this TesTask tesTask) { + TesTaskLog log = new(); tesTask.Logs ??= []; - tesTask.Logs.Add(new()); + tesTask.Logs.Add(log); - return tesTask.Logs.Last(); + return log; } /// @@ -150,7 +153,7 @@ public static BatchNodeMetrics GetOrAddBatchNodeMetrics(this TesTaskLog tesTaskL /// /// Initialized Metadata property public static Dictionary GetOrAddMetadata(this TesTaskLog tesTaskLog) - => tesTaskLog.Metadata ??= new(); + => tesTaskLog.Metadata ??= []; /// /// Returns the last . Adds it if none exist. @@ -159,9 +162,11 @@ public static Dictionary GetOrAddMetadata(this TesTaskLog tesTas /// Initialized public static TesExecutorLog GetOrAddExecutorLog(this TesTaskLog tesTaskLog) { - if (tesTaskLog.Logs is null || !tesTaskLog.Logs.Any()) + if ((tesTaskLog.Logs?.Count ?? 
0) == 0) { - tesTaskLog.Logs = [new()]; + TesExecutorLog log = new(); + tesTaskLog.Logs = [log]; + return log; } return tesTaskLog.Logs.Last(); diff --git a/src/Tes/Repository/PostgreSqlCachingRepository.cs b/src/Tes/Repository/PostgreSqlCachingRepository.cs index 886707ce4..660d3d7a2 100644 --- a/src/Tes/Repository/PostgreSqlCachingRepository.cs +++ b/src/Tes/Repository/PostgreSqlCachingRepository.cs @@ -30,7 +30,7 @@ public abstract class PostgreSqlCachingRepository : IDisposable .WaitAndRetryAsync(10, i => TimeSpan.FromSeconds(Math.Pow(2, i))); private record struct WriteItem(TDbItem DbItem, WriteAction Action, TaskCompletionSource TaskSource); - private readonly Channel itemsToWrite = Channel.CreateUnbounded(); + private readonly Channel itemsToWrite = Channel.CreateUnbounded(new() { SingleReader = true }); private readonly ConcurrentDictionary updatingItems = new(); // Collection of all pending updates to be written, to faciliate detection of simultaneous parallel updates. private readonly CancellationTokenSource writerWorkerCancellationTokenSource = new(); private readonly Task writerWorkerTask; @@ -108,7 +108,7 @@ protected TResult EnsureActiveItemInCache(TDbItem item, FuncThe of to query. /// A for controlling the lifetime of the asynchronous operation. /// order-by function. - /// pagination selection (within the order-by). + /// pagination selection (within ). /// The WHERE clause parts for selection in the query. /// The WHERE clause for raw SQL for selection in the query. 
/// @@ -246,6 +246,7 @@ protected virtual void Dispose(bool disposing) { if (disposing) { + //_ = itemsToWrite.Writer.TryComplete(); writerWorkerCancellationTokenSource.Cancel(); try diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index bf799b184..4d60162e5 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -735,7 +735,7 @@ public async Task BatchJobContainsExpectedBatchPoolInformation() var batchScheduler = serviceProvider.GetT(); { - await using var schedulerBackgroundTasks = new PerformBatchSchedulerBackgroundTasks(batchScheduler); + await using PerformBatchSchedulerBackgroundTasks _1 = new(batchScheduler); _ = await batchScheduler.ProcessQueuedTesTaskAsync(tesTask, CancellationToken.None); } @@ -1358,7 +1358,7 @@ public async Task PoolIsCreatedWithoutPublicIpWhenSubnetAndDisableBatchNodesPubl if (azureProxyReturnValues.BatchTaskState is null) { - await using PerformBatchSchedulerBackgroundTasks schedulerBackgroundTasks = new(batchScheduler); + await using PerformBatchSchedulerBackgroundTasks _1 = new(batchScheduler); await Parallel.ForEachAsync(tesTasks, async (task, token) => _ = await batchScheduler.ProcessQueuedTesTaskAsync(task, token)); } else @@ -1652,7 +1652,9 @@ private readonly async Task RepeatedlyCallPerformBackgroundTasksAsync() { try { - await batchScheduler.PerformBackgroundTasksAsync(cancellationToken.Token); + await batchScheduler.PerformShortBackgroundTasksAsync(CancellationToken.None); + await Task.WhenAll(batchScheduler.PerformLongBackgroundTasksAsync(CancellationToken.None).ToBlockingEnumerable(CancellationToken.None)); + await Task.WhenAll(batchScheduler.PerformLongBackgroundTasksAsync(CancellationToken.None).ToBlockingEnumerable(CancellationToken.None)); await timer.WaitForNextTickAsync(cancellationToken.Token); } catch (OperationCanceledException) @@ -1662,7 +1664,7 @@ private readonly async Task RepeatedlyCallPerformBackgroundTasksAsync() } 
} - public readonly async ValueTask DisposeAsync() + readonly async ValueTask IAsyncDisposable.DisposeAsync() { cancellationToken.Cancel(); await task; diff --git a/src/TesApi.Web/BatchPools.BatchScheduler.cs b/src/TesApi.Web/BatchPools.BatchScheduler.cs index 1a5ababc7..d33b04872 100644 --- a/src/TesApi.Web/BatchPools.BatchScheduler.cs +++ b/src/TesApi.Web/BatchPools.BatchScheduler.cs @@ -26,10 +26,10 @@ public partial class BatchScheduler internal delegate ValueTask ModelPoolFactory(string poolId, CancellationToken cancellationToken); - private (string PoolKey, string DisplayName) GetPoolKey(Tes.Models.TesTask tesTask, VirtualMachineInformationWithDataDisks virtualMachineInformation, List identities) + private string GetPoolKey(Tes.Models.TesTask tesTask, ref VirtualMachineInformationWithDataDisks virtualMachineInformation) { - var identityResourceIds = identities is not null && identities.Count > 0 - ? string.Join(";", identities) + var identityResourceIds = virtualMachineInformation.Identities is not null && virtualMachineInformation.Identities.Count > 0 + ? 
string.Join(";", virtualMachineInformation.Identities) : ""; var executorImage = tesTask.Executors.First().Image; @@ -56,7 +56,7 @@ public partial class BatchScheduler if (displayName.Length > 1024) { // Remove "paths" of identityResourceId - displayName = displayName[..^identityResourceIds.Length] + string.Join(";", identities.Select(x => x[(x.LastIndexOf('/') + 1)..])); + displayName = displayName[..^identityResourceIds.Length] + string.Join(";", virtualMachineInformation.Identities.Select(x => x[(x.LastIndexOf('/') + 1)..])); if (displayName.Length > 1024) { @@ -65,7 +65,8 @@ public partial class BatchScheduler } } - return (name, displayName); + virtualMachineInformation.PoolDisplayName = displayName; + return name; static string LimitVmSize(string vmSize, int limit) { @@ -249,6 +250,8 @@ internal IEnumerable GetPoolGroupKeys() internal sealed class BatchPools : KeyedCollection { + private readonly object _lock = new(); // TODO: replace object with Lock in .NET 9/10 + public BatchPools() : base(StringComparer.OrdinalIgnoreCase) { } @@ -267,9 +270,12 @@ public IBatchPool GetPoolOrDefault(string poolId) public bool Add(IBatchPool pool) { - return TryGetValue(GetKeyForItem(pool), out var poolSet) - ? poolSet.Add(pool) - : AddSet(); + lock (_lock) + { + return TryGetValue(GetKeyForItem(pool), out var poolSet) + ? 
poolSet.Add(pool) + : AddSet(); + } bool AddSet() { @@ -280,23 +286,26 @@ bool AddSet() public bool Remove(IBatchPool pool) { - if (TryGetValue(GetKeyForItem(pool), out var poolSet)) + lock (_lock) { - if (poolSet.Remove(pool)) + if (TryGetValue(GetKeyForItem(pool), out var poolSet)) { - if (0 == poolSet.Count) + if (poolSet.Remove(pool)) { - if (!Remove(poolSet)) + if (0 == poolSet.Count) { - throw new InvalidOperationException(); + if (!Remove(poolSet)) + { + throw new InvalidOperationException(); + } } - } - return true; + return true; + } } - } - return false; + return false; + } } internal IEnumerable GetPoolKeys() diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 0943a94c7..e9ee20af6 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -547,12 +547,7 @@ public string GetTesTaskIdFromCloudTaskId(string cloudTaskId) return separatorIndex == -1 ? cloudTaskId : cloudTaskId[..separatorIndex]; } - // Collections and records managing the processing of TesTasks in Queued status - private record struct PendingCloudTask(CloudTask CloudTask, TaskCompletionSource TaskCompletion); - private record struct PendingPoolRequest(string PoolKey, VirtualMachineInformationWithDataDisks VirtualMachineInfo, IList Identities, string PoolDisplayName, TaskCompletionSource TaskCompletion); - private record struct PendingPool(string PoolKey, VirtualMachineInformationWithDataDisks VirtualMachineInfo, IList Identities, string PoolDisplayName, int InitialTarget, IEnumerable> TaskCompletions); - private record struct ImmutableQueueWithTimer(Timer Timer, ImmutableQueue Queue); - + // Collections managing the processing of TesTasks in Queued status private readonly ConcurrentDictionary> _queuedTesTaskPendingTasksByJob = new(); private readonly ConcurrentDictionary> _queuedTesTaskPendingPoolsByKey = new(); private readonly ConcurrentQueue<(string JobId, IList Tasks)> _queuedTesTaskPendingJobBatches = new(); @@ -591,17 
+586,23 @@ public async Task ProcessQueuedTesTaskAsync(TesTask tesTask, CancellationT logger.LogDebug(@"Checking quota for {TesTask}.", tesTask.Id); var virtualMachineInfo = await GetVmSizeAsync(tesTask, cancellationToken); - (poolKey, var displayName) = GetPoolKey(tesTask, virtualMachineInfo, identities); + virtualMachineInfo.Identities = identities; + poolKey = GetPoolKey(tesTask, ref virtualMachineInfo); + await quotaVerifier.CheckBatchAccountQuotasAsync(virtualMachineInfo.VM, needPoolOrJobQuotaCheck: !IsPoolAvailable(poolKey), cancellationToken: cancellationToken); // double await because the method call returns a System.Task. When that Task returns, the TesTask has been queued to a job and a pool exists to run that job's tasks - await await AttachQueuedTesTaskToBatchPoolAsync(poolKey, tesTask, virtualMachineInfo, identities, displayName, cancellationToken); + await await AttachQueuedTesTaskToBatchPoolAsync(poolKey, tesTask, virtualMachineInfo, cancellationToken); var tesTaskLog = tesTask.GetOrAddTesTaskLog(); tesTaskLog.StartTime = DateTimeOffset.UtcNow; tesTask.State = TesState.INITIALIZING; return true; } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + throw; + } catch (AggregateException aggregateException) { var result = false; @@ -644,7 +645,7 @@ public async Task ProcessQueuedTesTaskAsync(TesTask tesTask, CancellationT } } - private async ValueTask AttachQueuedTesTaskToBatchPoolAsync(string poolKey, TesTask tesTask, VirtualMachineInformationWithDataDisks virtualMachineInfo, IList identities, string poolDisplayName, CancellationToken cancellationToken) + private async ValueTask AttachQueuedTesTaskToBatchPoolAsync(string poolKey, TesTask tesTask, VirtualMachineInformationWithDataDisks virtualMachineInfo, CancellationToken cancellationToken) { TaskCompletionSource taskCompletion = new(); // This provides the System.Task this method returns @@ -657,9 +658,9 @@ private async ValueTask 
AttachQueuedTesTaskToBatchPoolAsync(string poolKey TaskCompletionSource poolCompletion = new(); // This provides the poolId of the pool provided for the task AddTValueToCollectorQueue( key: poolKey, - value: new PendingPoolRequest(poolKey, virtualMachineInfo, identities, poolDisplayName, poolCompletion), + value: new PendingPoolRequest(poolKey, virtualMachineInfo, poolCompletion), dictionary: _queuedTesTaskPendingPoolsByKey, - enqueue: (key, tasks) => _queuedTesTaskPendingPoolQuotas.Enqueue(new(key, tasks.First().VirtualMachineInfo, tasks.First().Identities, tasks.First().PoolDisplayName, tasks.Count, tasks.Select(t => t.TaskCompletion))), + enqueue: (key, tasks) => _queuedTesTaskPendingPoolQuotas.Enqueue(new(key, tasks.First().VirtualMachineInfo, tasks.Count, tasks.Select(t => t.TaskCompletion))), groupGatherWindow: QueuedTesTaskPoolGroupGatherWindow, maxCount: int.MaxValue); @@ -675,7 +676,7 @@ private async ValueTask AttachQueuedTesTaskToBatchPoolAsync(string poolKey tesTaskLog.VirtualMachineInfo = virtualMachineInfo.VM; var cloudTaskId = $"{tesTask.Id}-{tesTask.Logs.Count}"; tesTask.PoolId = pool.PoolId; - var cloudTask = await ConvertTesTaskToBatchTaskUsingRunnerAsync(cloudTaskId, tesTask, identities.Last(), virtualMachineInfo.VM.VmFamily, cancellationToken); + var cloudTask = await ConvertTesTaskToBatchTaskUsingRunnerAsync(cloudTaskId, tesTask, virtualMachineInfo.Identities.Last(), virtualMachineInfo.VM.VmFamily, cancellationToken); logger.LogInformation(@"Creating batch task for TES task {TesTaskId}. 
Using VM size {VmSize}.", tesTask.Id, virtualMachineInfo.VM.VmSize); @@ -687,9 +688,13 @@ private async ValueTask AttachQueuedTesTaskToBatchPoolAsync(string poolKey groupGatherWindow: QueuedTesTaskTaskGroupGatherWindow, maxCount: 100); } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + _ = taskCompletion.TrySetCanceled(cancellationToken); + } catch (Exception exception) { - taskCompletion.SetException(exception); + _ = taskCompletion.TrySetException(exception); } return taskCompletion.Task; @@ -757,18 +762,16 @@ private static void QueuedTesTaskAddTaskEntryToQueueFromDirectory( tasks.Add(task); } - enqueue(key, tasks); - - // Remove enqueued entries from directory without leaving empty entries. This is a loop because we are using ConcurrentDirectory - for (; - !(queue.IsEmpty switch + for (enqueue(key, tasks); // Add tasks to the outside "queue" (this happens only once) + // Remove enqueued entries from directory without leaving empty entries. This is a loop because we are using ConcurrentDirectory + !(queue.IsEmpty switch // The for loop ends when TryRemove or TryUpdate return true. 
{ - true => dictionary.TryRemove(new(key, refValue)), - false => dictionary.TryUpdate(key, new(timer, queue), refValue), + true => dictionary.TryRemove(new(key, refValue)), // Attempt queue removal from dictionary + false => dictionary.TryUpdate(key, new(timer, queue), refValue), // Attempt dictionary update with items remaining in the queue }); - queue = ImmutableQueue.CreateRange(refValue.Queue.WhereNot(tasks.Contains))) + queue = ImmutableQueue.CreateRange(refValue.Queue.WhereNot(tasks.Contains))) // get the list of items that were not "enqueued" { - refValue = dictionary[key]; + refValue = dictionary[key]; // get the current dictionary value for the key } if (queue.IsEmpty) @@ -784,7 +787,7 @@ private static void QueuedTesTaskAddTaskEntryToQueueFromDirectory( } /// - public async ValueTask PerformBackgroundTasksAsync(CancellationToken cancellationToken) + public async ValueTask PerformShortBackgroundTasksAsync(CancellationToken cancellationToken) { // Add a batch of tasks to a job if (_queuedTesTaskPendingJobBatches.TryDequeue(out var jobBatch)) @@ -796,69 +799,168 @@ await PerformTaskAsync( taskCompletions: tasks.Select(task => task.TaskCompletion), cancellationToken: cancellationToken); } + } - // Apply Pool and Job Quota limits + private static async Task PerformTaskAsync(Func method, IEnumerable taskCompletions, CancellationToken cancellationToken) + { + try + { + await method(cancellationToken); + taskCompletions?.ForEach(completion => completion.SetResult()); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + taskCompletions?.ForEach(completion => _ = completion.TrySetCanceled(cancellationToken)); + throw; + } + catch (Exception exception) { - Dictionary pools = []; + taskCompletions = (taskCompletions ?? 
[]).ToList(); - while (_queuedTesTaskPendingPoolQuotas.TryDequeue(out var pendingPool)) + if (taskCompletions.Any()) { - pools.Add(pendingPool.PoolKey, pendingPool); + taskCompletions.ForEach(completion => _ = completion.TrySetException(new AggregateException(Enumerable.Empty().Append(exception)))); } - - if (pools.Count != 0) + else { - // Determine how many new pools/jobs we need now - var requiredNewPools = pools.Keys.WhereNot(IsPoolAvailable).ToList(); + throw; + } + } + } - // Revisit pool/job quotas (the task quota analysis already dealt with the possibility of needing just one more pool or job). - if (requiredNewPools.Skip(1).Any()) - { - // This will remove pool keys we cannot accommodate due to quota, along with all of their associated tasks, from being queued into Batch. - logger.LogDebug(@"Checking pools and jobs quota to accommodate {NeededPools} additional pools.", requiredNewPools.Count); + private static async Task PerformTaskOfTAsync(Func> method, IEnumerable> taskCompletions, CancellationToken cancellationToken) + { + try + { + var result = await method(cancellationToken); + taskCompletions?.ForEach(completion => completion.SetResult(result)); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + taskCompletions?.ForEach(completion => _ = completion.TrySetCanceled(cancellationToken)); + throw; + } + catch (Exception exception) + { + taskCompletions = (taskCompletions ?? 
[]).ToList(); + + if (taskCompletions.Any()) + { + taskCompletions.ForEach(completion => _ = completion.TrySetException(new AggregateException(Enumerable.Empty().Append(exception)))); + } + else + { + throw; + } + } + } - var (exceededQuantity, exception) = await quotaVerifier.CheckBatchAccountPoolAndJobQuotasAsync(requiredNewPools.Count, cancellationToken); + /// + public IAsyncEnumerable PerformLongBackgroundTasksAsync(CancellationToken cancellationToken) + { + List tasks = []; - foreach (var task in ((IEnumerable)requiredNewPools) - .Reverse() - .SelectWhere(TryRemovePool) - .Take(exceededQuantity) - .SelectMany(t => t.TaskCompletions)) - { - task.SetException(exception); - } + // Apply Pool and Job Quota limits + { + Dictionary> poolsByKey = []; - bool TryRemovePool(string key, out PendingPool result) - { - logger.LogDebug(@"Due to quotas, unable to accommodate {PoolKey} batch pools.", key); - result = pools[key]; - pools.Remove(key); - return true; - } + while (_queuedTesTaskPendingPoolQuotas.TryDequeue(out var pendingPool)) + { + if (!poolsByKey.TryAdd(pendingPool.PoolKey, [pendingPool])) + { + poolsByKey[pendingPool.PoolKey].Add(pendingPool); } + } - logger.LogDebug(@"Obtaining {NewPools} batch pools.", pools.Count); + var existingPools = poolsByKey.Keys.Where(IsPoolAvailable).ToList(); - foreach (var poolToCreate in pools) + foreach (var key in existingPools) + { + if (poolsByKey.Remove(key, out var pools)) { - _queuedTesTaskPendingPools.Enqueue(poolToCreate.Value); + pools.ForEach(_queuedTesTaskPendingPools.Enqueue); } } + + // Determine how many new pools/jobs we need now (the task quota analysis already dealt with the possibility of needing just one more pool or job) + switch (poolsByKey.Count) + { + case 0: + break; + + case 1: + poolsByKey.First().Value.ForEach(_queuedTesTaskPendingPools.Enqueue); + break; + + default: + // Revisit pool/job quotas. 
+ tasks.Add(PerformTaskAsync( + method: async token => + { + // This will remove pool keys we cannot accommodate due to quota, along with all of their associated tasks, from being queued into Batch. + logger.LogDebug(@"Checking pools and jobs quota to accommodate {NeededPools} additional pools.", poolsByKey.Count); + + var (exceededQuantity, exception) = await quotaVerifier.CheckBatchAccountPoolAndJobQuotasAsync(poolsByKey.Count, token); + + foreach (var task in poolsByKey.Keys + .Reverse() + .SelectWhere>(TryRemovePool) + .Take(exceededQuantity) + .SelectMany(pools => pools) + .SelectMany(pool => pool.TaskCompletions)) + { + _ = task.TrySetException(exception); + } + + bool TryRemovePool(string key, out IEnumerable result) + { + logger.LogDebug(@"Due to quotas, unable to accommodate adding batch pool for key: {PoolKey}.", key); + result = poolsByKey[key]; + poolsByKey.Remove(key); + return true; + } + + logger.LogDebug(@"Obtaining {NewPools} batch pools.", poolsByKey.Count); + + foreach (var poolToCreate in poolsByKey.Values.SelectMany(pools => pools)) + { + _queuedTesTaskPendingPools.Enqueue(poolToCreate); + } + }, + taskCompletions: default, + cancellationToken: cancellationToken) + .ContinueWith(task => + { + var tasks = poolsByKey.Values.SelectMany(pools => pools.Select(pool => pool.TaskCompletions)).SelectMany(source => source); + + if (task.IsFaulted) + { + var exception = task.Exception; + tasks.ForEach(task => _ = task.TrySetException(exception)); + } + else // task is cancelled + { + tasks.ForEach(task => _ = task.TrySetCanceled()); + } + }, + TaskContinuationOptions.NotOnRanToCompletion | TaskContinuationOptions.ExecuteSynchronously)); + break; + } } - // Create a batch pool - if (_queuedTesTaskPendingPools.TryDequeue(out var pool)) + // Create batch pools + while (_queuedTesTaskPendingPools.TryDequeue(out var pool)) { logger.LogDebug(@"Creating pool for {PoolKey}.", pool.PoolKey); var useGen2 = 
pool.VirtualMachineInfo.VM.HyperVGenerations?.Contains("V2", StringComparer.OrdinalIgnoreCase); - await PerformTaskOfTAsync( + tasks.Add(PerformTaskOfTAsync( method: async token => (await GetOrAddPoolAsync( key: pool.PoolKey, isPreemptable: pool.VirtualMachineInfo.VM.LowPriority, modelPoolFactory: async (id, ct) => await GetPoolSpecification( name: id, - displayName: pool.PoolDisplayName, - poolIdentity: GetBatchPoolIdentity(pool.Identities.WhereNot(string.IsNullOrWhiteSpace).ToList()), + displayName: pool.VirtualMachineInfo.PoolDisplayName, + poolIdentity: GetBatchPoolIdentity(pool.VirtualMachineInfo.Identities.WhereNot(string.IsNullOrWhiteSpace).ToList()), vmInfo: pool.VirtualMachineInfo, initialTarget: pool.InitialTarget, nodeInfo: (useGen2 ?? false) ? gen2BatchNodeInfo : gen1BatchNodeInfo, @@ -866,34 +968,10 @@ await PerformTaskOfTAsync( cancellationToken: token)) .PoolId, taskCompletions: pool.TaskCompletions, - cancellationToken: cancellationToken); - } - - async static ValueTask PerformTaskAsync(Func method, IEnumerable taskCompletions, CancellationToken cancellationToken) - { - try - { - await method(cancellationToken); - taskCompletions.ForEach(completion => completion.SetResult()); - } - catch (Exception exception) - { - taskCompletions.ForEach(completion => completion.SetException(new AggregateException(Enumerable.Empty().Append(exception)))); - } + cancellationToken: cancellationToken)); } - async static ValueTask PerformTaskOfTAsync(Func> method, IEnumerable> taskCompletions, CancellationToken cancellationToken) - { - try - { - var result = await method(cancellationToken); - taskCompletions.ForEach(completion => completion.SetResult(result)); - } - catch (Exception exception) - { - taskCompletions.ForEach(completion => completion.SetException(new AggregateException(Enumerable.Empty().Append(exception)))); - } - } + return tasks.Where(task => !task.IsCanceled).WhenEach(cancellationToken).Where(task => !task.IsCanceled); } Task 
QueuedTesTaskHandleExceptionAsync(Exception exception, string poolKey, TesTask tesTask) @@ -1738,8 +1816,21 @@ public CombinedBatchTaskInfo(AzureBatchTaskState state, string alternateSystemLo public string AlternateSystemLogItem { get; set; } } + // Records managing the processing of TesTasks in Queued status + private record struct PendingCloudTask(CloudTask CloudTask, TaskCompletionSource TaskCompletion); + private record struct PendingPoolRequest(string PoolKey, VirtualMachineInformationWithDataDisks VirtualMachineInfo, TaskCompletionSource TaskCompletion); + private record struct PendingPool(string PoolKey, VirtualMachineInformationWithDataDisks VirtualMachineInfo, int InitialTarget, IEnumerable> TaskCompletions); + private record struct ImmutableQueueWithTimer(Timer Timer, ImmutableQueue Queue); + internal record class VirtualMachineInformationWithDataDisks(VirtualMachineInformation VM, IList DataDisks) { + /// + /// Azure User Assigned Managed Identities + /// + public IList Identities { get; set; } + + public string PoolDisplayName { get; set; } + /// /// Converts to . /// diff --git a/src/TesApi.Web/IBatchScheduler.cs b/src/TesApi.Web/IBatchScheduler.cs index a6c088974..c306d5524 100644 --- a/src/TesApi.Web/IBatchScheduler.cs +++ b/src/TesApi.Web/IBatchScheduler.cs @@ -113,11 +113,18 @@ public interface IBatchScheduler IAsyncEnumerable GetEventMessagesAsync(CancellationToken cancellationToken, string @event = default); /// - /// Performs background tasks. + /// Performs background tasks expected to complete in less than a second in aggregate. /// /// A for controlling the lifetime of the asynchronous operation. /// - ValueTask PerformBackgroundTasksAsync(CancellationToken cancellationToken); + ValueTask PerformShortBackgroundTasksAsync(CancellationToken cancellationToken); + + /// + /// Performs background tasks expected to take longer than a second. + /// + /// A for controlling the lifetime of the asynchronous operation. 
+ /// + IAsyncEnumerable PerformLongBackgroundTasksAsync(CancellationToken cancellationToken); /// /// Identifies an azure cloud task. diff --git a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs index e4448e26b..1e11d238e 100644 --- a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs +++ b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs @@ -101,7 +101,7 @@ protected virtual void ExecuteSetup(CancellationToken cancellationToken) { } /// Action to repeatedly run. /// Triggered when is called. /// A that represents this method's operations. - protected async ValueTask ExecuteActionOnIntervalAsync(TimeSpan runInterval, Func action, CancellationToken cancellationToken) + protected async Task ExecuteActionOnIntervalAsync(TimeSpan runInterval, Func action, CancellationToken cancellationToken) { ArgumentNullException.ThrowIfNull(action); @@ -149,6 +149,10 @@ protected async ValueTask ProcessOrchestratedTesTaskAsync(string pollName, Relat { isModified = await task; } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + throw; + } catch (Exception exc) { if (++tesTask.ErrorCount > 3 || // TODO: Should we increment this for exceptions here (current behavior) or the attempted executions on the batch? @@ -272,6 +276,10 @@ protected async ValueTask ProcessOrchestratedTesTaskAsync(string pollName, Relat Logger.LogError(exc, "Updating TES Task '{TesTask}' threw {ExceptionType}: '{ExceptionMessage}'. Stack trace: {ExceptionStackTrace}", tesTask.Id, exc.GetType().FullName, exc.Message, exc.StackTrace); } } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + throw; + } catch (Exception exc) { Logger.LogError(exc, "Updating TES Task '{TesTask}' threw {ExceptionType}: '{ExceptionMessage}'. 
Stack trace: {ExceptionStackTrace}", tesTask.Id, exc.GetType().FullName, exc.Message, exc.StackTrace); diff --git a/src/TesApi.Web/PoolScheduler.cs b/src/TesApi.Web/PoolScheduler.cs index c2a0ee7d4..c4941c520 100644 --- a/src/TesApi.Web/PoolScheduler.cs +++ b/src/TesApi.Web/PoolScheduler.cs @@ -59,9 +59,9 @@ protected override void ExecuteSetup(CancellationToken cancellationToken) } /// - protected override ValueTask ExecuteCoreAsync(CancellationToken cancellationToken) + protected override async ValueTask ExecuteCoreAsync(CancellationToken cancellationToken) { - return ExecuteActionOnIntervalAsync( + await ExecuteActionOnIntervalAsync( RunInterval, async token => await ExecuteActionOnPoolsAsync( async (pool, token) => @@ -98,6 +98,10 @@ await Parallel.ForEachAsync(pools, cancellationToken, async (pool, token) => { await action(pool, token); } + catch (OperationCanceledException) when (token.IsCancellationRequested) + { + throw; + } catch (Exception exc) { Logger.LogError(exc, @"Batch pool {PoolId} threw an exception when serviced.", pool.PoolId); @@ -353,7 +357,7 @@ static string FormatNameValuePair(NameValuePair pair) static IEnumerable RepeatUntil(Func func, Predicate stop) { - do + while (true) { var t = func(); @@ -364,7 +368,6 @@ static IEnumerable RepeatUntil(Func func, Predicate stop) yield return t; } - while (true); } } diff --git a/src/TesApi.Web/TaskScheduler.cs b/src/TesApi.Web/TaskScheduler.cs index 4048659cc..c581bf586 100644 --- a/src/TesApi.Web/TaskScheduler.cs +++ b/src/TesApi.Web/TaskScheduler.cs @@ -30,7 +30,8 @@ internal class TaskScheduler(RunnerEventsProcessor nodeEventProcessor, Microsoft { private static readonly TimeSpan blobRunInterval = TimeSpan.FromSeconds(15); internal static readonly TimeSpan BatchRunInterval = TimeSpan.FromSeconds(30); // The very fastest process inside of Azure Batch accessing anything within pools or jobs appears to use a 30 second polling interval - private static readonly TimeSpan backgroundRunInterval = 
TimeSpan.FromSeconds(1); + private static readonly TimeSpan shortBackgroundRunInterval = TimeSpan.FromSeconds(1); + private static readonly TimeSpan longBackgroundRunInterval = TimeSpan.FromSeconds(1); private readonly RunnerEventsProcessor nodeEventProcessor = nodeEventProcessor; /// @@ -52,7 +53,8 @@ protected override async ValueTask ExecuteSetupAsync(CancellationToken cancellat protected override async ValueTask ExecuteCoreAsync(CancellationToken cancellationToken) { await Task.WhenAll( - ExecuteBackgroundTasksAsync(cancellationToken), + ExecuteShortBackgroundTasksAsync(cancellationToken), + ExecuteLongBackgroundTasksAsync(cancellationToken), ExecuteCancelledTesTasksOnBatchAsync(cancellationToken), ExecuteQueuedTesTasksOnBatchAsync(cancellationToken), ExecuteUpdateTesTaskFromEventBlobAsync(cancellationToken)); @@ -62,15 +64,15 @@ await Task.WhenAll( /// Retrieves all queued TES tasks from the database, performs an action in the batch system, and updates the resultant state /// /// - private async Task ExecuteQueuedTesTasksOnBatchAsync(CancellationToken cancellationToken) + private Task ExecuteQueuedTesTasksOnBatchAsync(CancellationToken cancellationToken) { - var query = new Func>>( + Func>> query = new( async token => (await Repository.GetItemsAsync( predicate: t => t.State == TesState.QUEUED, cancellationToken: token)) .OrderBy(t => t.CreationTime)); - await ExecuteActionOnIntervalAsync(BatchRunInterval, + return ExecuteActionOnIntervalAsync(BatchRunInterval, async cancellation => { await Parallel.ForEachAsync( @@ -88,9 +90,21 @@ await Parallel.ForEachAsync( /// /// Triggered when Microsoft.Extensions.Hosting.IHostedService.StopAsync(System.Threading.CancellationToken) is called. 
/// - private async Task ExecuteBackgroundTasksAsync(CancellationToken cancellationToken) + private Task ExecuteShortBackgroundTasksAsync(CancellationToken cancellationToken) { - await ExecuteActionOnIntervalAsync(backgroundRunInterval, BatchScheduler.PerformBackgroundTasksAsync, cancellationToken); + return ExecuteActionOnIntervalAsync(shortBackgroundRunInterval, BatchScheduler.PerformShortBackgroundTasksAsync, cancellationToken); + } + + /// + /// Retrieves all event blobs from storage and updates the resultant state. + /// + /// Triggered when Microsoft.Extensions.Hosting.IHostedService.StopAsync(System.Threading.CancellationToken) is called. + /// + private async Task ExecuteLongBackgroundTasksAsync(CancellationToken cancellationToken) + { + await ExecuteActionOnIntervalAsync(longBackgroundRunInterval, + async token => await Task.WhenAll(BatchScheduler.PerformLongBackgroundTasksAsync(token).ToBlockingEnumerable(token)), + cancellationToken); } /// @@ -98,16 +112,16 @@ private async Task ExecuteBackgroundTasksAsync(CancellationToken cancellationTok /// /// Triggered when Microsoft.Extensions.Hosting.IHostedService.StopAsync(System.Threading.CancellationToken) is called. /// - private async Task ExecuteCancelledTesTasksOnBatchAsync(CancellationToken cancellationToken) + private Task ExecuteCancelledTesTasksOnBatchAsync(CancellationToken cancellationToken) { - var query = new Func>>( + Func>> query = new( async token => (await Repository.GetItemsAsync( predicate: t => t.State == TesState.CANCELING, cancellationToken: token)) .OrderByDescending(t => t.CreationTime) .ToAsyncEnumerable()); - await ExecuteActionOnIntervalAsync(BatchRunInterval, + return ExecuteActionOnIntervalAsync(BatchRunInterval, token => OrchestrateTesTasksOnBatchAsync( "Cancelled", query, @@ -124,9 +138,9 @@ await ExecuteActionOnIntervalAsync(BatchRunInterval, /// /// Triggered when Microsoft.Extensions.Hosting.IHostedService.StopAsync(System.Threading.CancellationToken) is called. 
/// - private async Task ExecuteUpdateTesTaskFromEventBlobAsync(CancellationToken cancellationToken) + private Task ExecuteUpdateTesTaskFromEventBlobAsync(CancellationToken cancellationToken) { - await ExecuteActionOnIntervalAsync(blobRunInterval, + return ExecuteActionOnIntervalAsync(blobRunInterval, async token => await UpdateTesTasksFromAvailableEventsAsync( await ParseAvailableEvents(token), @@ -160,6 +174,10 @@ await Parallel.ForEachAsync(BatchScheduler.GetEventMessagesAsync(cancellationTok var state = await nodeEventProcessor.GetMessageBatchStateAsync(eventMessage, tesTask, token); messages.Add((eventMessage, tesTask, state, ct => nodeEventProcessor.MarkMessageProcessedAsync(eventMessage, ct))); } + catch (OperationCanceledException) when (token.IsCancellationRequested) + { + throw; + } catch (ArgumentException ex) { Logger.LogError(ex, @"Verifying event metadata failed: {ErrorMessage}", ex.Message); @@ -240,6 +258,10 @@ await Parallel.ForEachAsync(eventStates.Select(@event => @event.MarkProcessedAsy { await markEventProcessed(token); } + catch (OperationCanceledException) when (token.IsCancellationRequested) + { + throw; + } catch (Exception ex) { Logger.LogError(ex, @"Failed to tag event as processed."); From eb4c7806a21136dc377c01c37ffaad2c744c6d01 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 14 Oct 2024 10:13:02 -0700 Subject: [PATCH 147/202] formatting --- src/TesApi.Web/BatchScheduler.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index e9ee20af6..472850feb 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -762,8 +762,8 @@ private static void QueuedTesTaskAddTaskEntryToQueueFromDirectory( tasks.Add(task); } + // Remove enqueued entries from directory without leaving empty entries. 
This is a loop because we are using ConcurrentDirectory for (enqueue(key, tasks); // Add tasks to the outside "queue" (this happens only once) - // Remove enqueued entries from directory without leaving empty entries. This is a loop because we are using ConcurrentDirectory !(queue.IsEmpty switch // The for loop ends when TryRemove or TryUpdate return true. { true => dictionary.TryRemove(new(key, refValue)), // Attempt queue removal from dictionary From 4409ca65cf8f445d8d37ca017e307203c9eee5ea Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 14 Oct 2024 18:10:39 -0700 Subject: [PATCH 148/202] several fixes --- src/Tes.Runner/Events/EventMessage.cs | 4 +- src/Tes.Runner/Events/EventsPublisher.cs | 4 +- src/TesApi.Tests/BatchSchedulerTests.cs | 2 +- src/TesApi.Web/BatchPool.cs | 58 +++++++++++++------ src/TesApi.Web/BatchPools.BatchScheduler.cs | 2 +- src/TesApi.Web/BatchScheduler.cs | 31 ++++------ src/TesApi.Web/Events/RunnerEventsMessage.cs | 10 ++++ .../Events/RunnerEventsProcessor.cs | 6 +- src/TesApi.Web/IBatchScheduler.cs | 54 ++++++++++++----- src/TesApi.Web/PoolScheduler.cs | 9 ++- src/TesApi.Web/TaskScheduler.cs | 52 ++++++++++++++++- 11 files changed, 165 insertions(+), 67 deletions(-) diff --git a/src/Tes.Runner/Events/EventMessage.cs b/src/Tes.Runner/Events/EventMessage.cs index 9ef9ce59a..89574f17f 100644 --- a/src/Tes.Runner/Events/EventMessage.cs +++ b/src/Tes.Runner/Events/EventMessage.cs @@ -32,10 +32,10 @@ public sealed class EventMessage public DateTime Created { get; set; } [JsonPropertyName("eventVersion")] - public string EventVersion { get; set; } = null!; + public Version EventVersion { get; set; } = null!; [JsonPropertyName("eventDataVersion")] - public string EventDataVersion { get; set; } = null!; + public Version EventDataVersion { get; set; } = null!; [JsonPropertyName("eventData")] public Dictionary? 
EventData { get; set; } diff --git a/src/Tes.Runner/Events/EventsPublisher.cs b/src/Tes.Runner/Events/EventsPublisher.cs index f92d958d6..39e4f7364 100644 --- a/src/Tes.Runner/Events/EventsPublisher.cs +++ b/src/Tes.Runner/Events/EventsPublisher.cs @@ -11,8 +11,8 @@ namespace Tes.Runner.Events; public class EventsPublisher : IAsyncDisposable { - public const string EventVersion = "1.0"; - public const string EventDataVersion = "1.0"; + public static readonly Version EventVersion = new(1, 0); + public static readonly Version EventDataVersion = new(1, 0); public const string TesTaskRunnerEntityType = "TesRunnerTask"; public const string DownloadStartEvent = "downloadStart"; public const string DownloadEndEvent = "downloadEnd"; diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index 4d60162e5..933eb22dc 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -1363,7 +1363,7 @@ public async Task PoolIsCreatedWithoutPublicIpWhenSubnetAndDisableBatchNodesPubl } else { - await foreach (var _ in batchScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, Enumerable.Repeat(azureProxyReturnValues.BatchTaskState, tesTasks.Length).ToArray(), CancellationToken.None)) { } + await Parallel.ForEachAsync(tesTasks, async (task, token) => _ = await batchScheduler.ProcessTesTaskBatchStateAsync(task, azureProxyReturnValues.BatchTaskState, token)); } var createBatchPoolAsyncInvocation = serviceProvider.BatchPoolManager.Invocations.FirstOrDefault(i => i.Method.Name == nameof(IBatchPoolManager.CreateBatchPoolAsync)); diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index aa171918a..89ca4e401 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -4,6 +4,7 @@ using System; using System.Collections.Generic; using System.Linq; +using System.Text.Json; using System.Threading; using System.Threading.Tasks; using Azure; @@ -29,7 +30,7 @@ public sealed partial class 
BatchPool public const string CloudPoolSelectClause = "creationTime,id,identity,metadata"; /// - /// Autoscale evalutation interval + /// Autoscale evaluation interval /// public static TimeSpan AutoScaleEvaluationInterval { get; } = TimeSpan.FromMinutes(5); @@ -131,7 +132,7 @@ private enum ScalingMode /// Reset state to . /// /// - /// This state exists to eliminate premature redisabling of autoscale mode. + /// This state exists to prevent premature re-disabling of autoscale mode. /// SettingAutoScale } @@ -153,15 +154,15 @@ private void EnsureScalingModeSet(bool? autoScaleEnabled) * If the the scaling mode does not correspond to the actual state of autoscale enablement, this method guides us towards the desired state. * * Barring outside intervention, at each and every time this method is called, the following should always hold true: - * |------------------|---------------------|-------------------------|-------------------------| - * | autoScaleEnabled | ScalingMode | Last action | Next action | - * |------------------|---------------------|-------------------------|-------------------------| - * | true | AutoScaleEnabled | Normal long-term state |Change for select errrors| - * | false | AutoScaleDisabled | Recently disabled AS | Perform needed actions | - * | false | RemovingFailedNodes | Manual resizing actions | Reenable autoscale mode | - * | true | WaitingForAutoScale | Ensure autoscale works | Delay and re-assess | - * | true | SettingAutoScale | Assess pool response | Restore normal long-term| - * |------------------|---------------------|-------------------------|-------------------------| + * |------------------|---------------------|-------------------------|--------------------------| + * | autoScaleEnabled | ScalingMode | Last action | Next action | + * |------------------|---------------------|-------------------------|--------------------------| + * | true | AutoScaleEnabled | Normal long-term state | Change for select errors | + * | false | 
AutoScaleDisabled | Recently disabled AS | Perform needed actions | + * | false | RemovingFailedNodes | Manual resizing actions | Reenable autoscale mode | + * | true | WaitingForAutoScale | Ensure autoscale works | Delay and re-assess | + * | true | SettingAutoScale | Assess pool response | Restore normal long-term | + * |------------------|---------------------|-------------------------|--------------------------| * * The first time this method is called, ScalingMode will be Unknown. Initialize it to an appropriate value to initialize the state machine's state. * If autoScaleEnabled is null, don't change anything. @@ -274,7 +275,7 @@ public static string AutoPoolFormula(bool preemptable, int initialTarget) Notes on the formula: Reference: https://docs.microsoft.com/en-us/azure/batch/batch-automatic-scaling - In order to avoid confusion, some of the builtin variable names in batch's autoscale formulas are named in a way that may not initially appear intuitive: + In order to avoid confusion, some of the built-in variable names in batch's autoscale formulas are named in a way that may not initially appear intuitive: Running tasks are named RunningTasks, which is fine Queued tasks are named ActiveTasks, which matches the same value of the "state" property The sum of running & queued tasks (what I would have named TotalTasks) is named PendingTasks @@ -286,7 +287,7 @@ public static string AutoPoolFormula(bool preemptable, int initialTarget) Whenever autoscaling is turned on, whether or not the pool was just created, there are no sampled metrics available. Thus, we need to prevent the expected errors that would result from trying to extract the samples. Later on, if recent samples aren't available, we prefer that the formula fails (firstly, so we can potentially capture that, and secondly, so that we don't suddenly try to remove all nodes from the pool when there's still demand) - so we use a timed scheme to substitue an "initial value" (aka initialTarget). 
+ so we use a timed scheme to substitute an "initial value" (aka initialTarget). We set NodeDeallocationOption to taskcompletion to prevent wasting time/money by stopping a running task, only to requeue it onto another node, or worse, fail it, just because batch's last sample was taken longer ago than a task's assignment was made to a node, because the formula evaluations intervals are not coordinated @@ -787,7 +788,7 @@ public async ValueTask AssignPoolAsync(CloudPool pool, string runnerMD5, bool fo try { broken |= !pool.Metadata.Any(m => BatchScheduler.PoolMetadata.Equals(m.Name, StringComparison.Ordinal)) || - !IBatchScheduler.PoolMetadata.Create(pool.Metadata.Single(m => BatchScheduler.PoolMetadata.Equals(m.Name, StringComparison.Ordinal)).Value).Validate(); + !IBatchScheduler.PoolMetadata.Create(pool.Metadata.Single(m => BatchScheduler.PoolMetadata.Equals(m.Name, StringComparison.Ordinal)).Value).Validate(validateEventsVersion: false); } catch (InvalidOperationException) { @@ -801,7 +802,7 @@ public async ValueTask AssignPoolAsync(CloudPool pool, string runnerMD5, bool fo { broken = true; } - catch (System.Text.Json.JsonException) + catch (JsonException) { broken = true; } @@ -825,9 +826,22 @@ private void Configure(CloudPool pool, string runnerMD5, bool forceRemove) { ArgumentNullException.ThrowIfNull(pool); + var metadata = IBatchScheduler.PoolMetadata.Create(pool.Metadata.First(m => BatchScheduler.PoolMetadata.Equals(m.Name, StringComparison.Ordinal)).Value); + PoolId = pool.Id; + + var eventVersionsMatch = Tes.Runner.Events.EventsPublisher.EventVersion.Equals(GetVersion(metadata.EventsVersion[nameof(Tes.Runner.Events.EventsPublisher.EventVersion)])) && + Tes.Runner.Events.EventsPublisher.EventDataVersion.Equals(GetVersion(metadata.EventsVersion[nameof(Tes.Runner.Events.EventsPublisher.EventDataVersion)])); + IsAvailable = !forceRemove && DetermineIsAvailable(pool.CreationTime) && - runnerMD5.Equals(IBatchScheduler.PoolMetadata.Create(pool.Metadata.Single(m 
=> BatchScheduler.PoolMetadata.Equals(m.Name, StringComparison.Ordinal)).Value).RunnerMD5, StringComparison.OrdinalIgnoreCase); + runnerMD5.Equals(metadata.RunnerMD5, StringComparison.OrdinalIgnoreCase) && + eventVersionsMatch; + + if (!eventVersionsMatch) + { + // + } + //IReadOnlyDictionary Identity = pool.Identity.UserAssignedIdentities.ToDictionary(identity => identity.ResourceId, identity => identity.ClientId, StringComparer.OrdinalIgnoreCase).AsReadOnly(); if (IsAvailable) @@ -835,8 +849,18 @@ private void Configure(CloudPool pool, string runnerMD5, bool forceRemove) Creation = pool.CreationTime.Value; } - IsDedicated = IBatchScheduler.PoolMetadata.Create(pool.Metadata.First(m => BatchScheduler.PoolMetadata.Equals(m.Name, StringComparison.Ordinal)).Value).IsDedicated; + IsDedicated = metadata.IsDedicated; _ = _batchPools.AddPool(this); + + static Version GetVersion(object value) + => value switch + { + JsonElement jsonElement when JsonValueKind.String.Equals(jsonElement.ValueKind) => new(jsonElement.ToString()), + JsonElement jsonElement when JsonValueKind.Number.Equals(jsonElement.ValueKind) => new(jsonElement.ToString()), + Version versionValue => versionValue, + string stringValue => new(stringValue), + _ => (Version)Convert.ChangeType(value, typeof(Version)), + }; } /// diff --git a/src/TesApi.Web/BatchPools.BatchScheduler.cs b/src/TesApi.Web/BatchPools.BatchScheduler.cs index d33b04872..4f7dd8093 100644 --- a/src/TesApi.Web/BatchPools.BatchScheduler.cs +++ b/src/TesApi.Web/BatchPools.BatchScheduler.cs @@ -149,7 +149,7 @@ internal async Task GetOrAddPoolAsync(string key, bool isPreemptable RandomNumberGenerator.Fill(uniquifier); var poolId = $"{key}-{uniquifier.ConvertToBase32().TrimEnd('=').ToLowerInvariant()}"; // embedded '-' is required by GetKeyFromPoolId() var modelPool = await modelPoolFactory(poolId, cancellationToken); - modelPool.Metadata.Add(new(PoolMetadata, new IBatchScheduler.PoolMetadata(this.batchPrefix, !isPreemptable, 
this.runnerMD5).ToString())); + modelPool.Metadata.Add(new(PoolMetadata, new IBatchScheduler.PoolMetadata(this.batchPrefix, !isPreemptable, this.runnerMD5, Events.RunnerEventsMessage.EventsVersion).ToString())); pool = batchPoolFactory(); await pool.CreatePoolAndJobAsync(modelPool, isPreemptable, runnerMD5, cancellationToken); } diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 472850feb..a7da54914 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -479,18 +479,19 @@ public async Task UploadTaskRunnerIfNeededAsync(CancellationToken cancellationTo } /// - public IAsyncEnumerable> ProcessTesTaskBatchStatesAsync(IEnumerable tesTasks, AzureBatchTaskState[] taskStates, CancellationToken cancellationToken) + public ValueTask ProcessTesTaskBatchStateAsync(TesTask tesTask, AzureBatchTaskState taskState, CancellationToken cancellationToken) { - ArgumentNullException.ThrowIfNull(tesTasks); - ArgumentNullException.ThrowIfNull(taskStates); + ArgumentNullException.ThrowIfNull(tesTask); + ArgumentNullException.ThrowIfNull(taskState); - return taskStates.Zip(tesTasks, (TaskState, TesTask) => (TaskState, TesTask)) - .Where(entry => entry.TesTask?.IsActiveState() ?? false) // Removes already terminal (and null) TesTasks from being further processed. 
- .Select(entry => new RelatedTask(WrapHandleTesTaskTransitionAsync(entry.TesTask, entry.TaskState, cancellationToken), entry.TesTask)) - .WhenEach(cancellationToken, tesTaskTask => tesTaskTask.Task); + if (!tesTask.IsActiveState()) // Ignore this state + { + return ValueTask.FromResult(false); + } - async Task WrapHandleTesTaskTransitionAsync(TesTask tesTask, AzureBatchTaskState azureBatchTaskState, CancellationToken cancellationToken) - => await HandleTesTaskTransitionAsync(tesTask, azureBatchTaskState, cancellationToken); + return tesTaskStateTransitions + .FirstOrDefault(m => (m.Condition is null || m.Condition(tesTask)) && (m.CurrentBatchTaskState is null || m.CurrentBatchTaskState == taskState.State)) + .ActionAsync(tesTask, taskState, cancellationToken); } /// @@ -1059,18 +1060,6 @@ Task QueuedTesTaskHandleExceptionAsync(Exception exception, string poolKey return Task.FromResult(true); } - /// - /// Transitions the to the new state, based on the rules defined in the tesTaskStateTransitions list. - /// - /// TES task - /// Current Azure Batch task info - /// A for controlling the lifetime of the asynchronous operation. - /// True if the TES task was changed. 
- private ValueTask HandleTesTaskTransitionAsync(TesTask tesTask, AzureBatchTaskState azureBatchTaskState, CancellationToken cancellationToken) - => tesTaskStateTransitions - .FirstOrDefault(m => (m.Condition is null || m.Condition(tesTask)) && (m.CurrentBatchTaskState is null || m.CurrentBatchTaskState == azureBatchTaskState.State)) - .ActionAsync(tesTask, azureBatchTaskState, cancellationToken); - private async Task ConvertTesTaskToBatchTaskUsingRunnerAsync(string taskId, TesTask task, string acrPullIdentity, string vmFamily, CancellationToken cancellationToken) { diff --git a/src/TesApi.Web/Events/RunnerEventsMessage.cs b/src/TesApi.Web/Events/RunnerEventsMessage.cs index 1116d1588..7e02b19d3 100644 --- a/src/TesApi.Web/Events/RunnerEventsMessage.cs +++ b/src/TesApi.Web/Events/RunnerEventsMessage.cs @@ -15,6 +15,16 @@ namespace TesApi.Web.Events /// The content of the event message. public record struct RunnerEventsMessage(Uri BlobUri, IDictionary Tags, string Event, Tes.Runner.Events.EventMessage RunnerEventMessage = default) { + /// + /// Task runner events version. + /// + internal static readonly IDictionary EventsVersion = new Dictionary(StringComparer.OrdinalIgnoreCase) + { + { nameof(Tes.Runner.Events.EventsPublisher.EventVersion), Tes.Runner.Events.EventsPublisher.EventVersion}, + { nameof(Tes.Runner.Events.EventsPublisher.TesTaskRunnerEntityType), Tes.Runner.Events.EventsPublisher.TesTaskRunnerEntityType }, + { nameof(Tes.Runner.Events.EventsPublisher.EventDataVersion), Tes.Runner.Events.EventsPublisher.EventDataVersion } + }; + /// /// Copy constructor replacing . 
/// diff --git a/src/TesApi.Web/Events/RunnerEventsProcessor.cs b/src/TesApi.Web/Events/RunnerEventsProcessor.cs index bc30dd5f4..a64f538d6 100644 --- a/src/TesApi.Web/Events/RunnerEventsProcessor.cs +++ b/src/TesApi.Web/Events/RunnerEventsProcessor.cs @@ -103,7 +103,7 @@ public async Task DownloadAndValidateMessageContentAsync(Ru try { var messageText = await azureProxy.DownloadBlobAsync(message.BlobUri, cancellationToken); - content = System.Text.Json.JsonSerializer.Deserialize(messageText) + content = System.Text.Json.JsonSerializer.Deserialize(messageText, Tes.Runner.Events.EventMessageContext.Default.EventMessage) ?? throw new DownloadOrParseException("Deserialize() returned null."); } catch (Exception ex) @@ -116,9 +116,9 @@ public async Task DownloadAndValidateMessageContentAsync(Ru // Validate content Validate(Guid.TryParse(content.Id, out _), $"{nameof(content.Id)}('{content.Id}') is malformed."); - Validate(Tes.Runner.Events.EventsPublisher.EventVersion.Equals(content.EventVersion, StringComparison.Ordinal), + Validate(Tes.Runner.Events.EventsPublisher.EventVersion.Equals(content.EventVersion), $"{nameof(content.EventVersion)}('{content.EventVersion}') is not recognized."); - Validate(Tes.Runner.Events.EventsPublisher.EventDataVersion.Equals(content.EventDataVersion, StringComparison.Ordinal), + Validate(Tes.Runner.Events.EventsPublisher.EventDataVersion.Equals(content.EventDataVersion), $"{nameof(content.EventDataVersion)}('{content.EventDataVersion}') is not recognized."); Validate(Tes.Runner.Events.EventsPublisher.TesTaskRunnerEntityType.Equals(content.EntityType, StringComparison.Ordinal), $"{nameof(content.EntityType)}('{content.EntityType}') is not recognized."); diff --git a/src/TesApi.Web/IBatchScheduler.cs b/src/TesApi.Web/IBatchScheduler.cs index c306d5524..56e5f05fc 100644 --- a/src/TesApi.Web/IBatchScheduler.cs +++ b/src/TesApi.Web/IBatchScheduler.cs @@ -3,6 +3,8 @@ using System; using System.Collections.Generic; +using System.Linq; +using 
System.Text.Json; using System.Threading; using System.Threading.Tasks; using Microsoft.Azure.Batch; @@ -37,13 +39,13 @@ public interface IBatchScheduler Task UploadTaskRunnerIfNeededAsync(CancellationToken cancellationToken); /// - /// Updates s with task-related state on a batch system + /// Updates the with task-related state on a batch system /// - /// s to schedule on the batch system. - /// s corresponding to each . + /// The TES task + /// Current Azure Batch task state info /// A for controlling the lifetime of the asynchronous operation. - /// True for each corresponding that needs to be persisted. - IAsyncEnumerable> ProcessTesTaskBatchStatesAsync(IEnumerable tesTasks, AzureBatchTaskState[] taskStates, CancellationToken cancellationToken); + /// True if the TES task was changed. + ValueTask ProcessTesTaskBatchStateAsync(TesTask tesTask, AzureBatchTaskState taskState, CancellationToken cancellationToken); /// /// Schedules a s on a batch system @@ -137,17 +139,18 @@ public record struct CloudTaskId(string JobId, string TaskId, DateTime Created); /// /// TES metadata carried in the batch pool. /// - /// - /// - /// - record struct PoolMetadata(string HostName, bool IsDedicated, string RunnerMD5) + /// . + /// Compute nodes in pool are not preemptible. + /// NodeTaskRunner hash. + /// . + record struct PoolMetadata(string HostName, bool IsDedicated, string RunnerMD5, IDictionary EventsVersion) { - private static readonly System.Text.Json.JsonSerializerOptions Options = new(System.Text.Json.JsonSerializerDefaults.Web); + private static readonly JsonSerializerOptions Options = new(JsonSerializerDefaults.Web); /// public override readonly string ToString() { - return System.Text.Json.JsonSerializer.Serialize(this, Options); + return JsonSerializer.Serialize(this, Options); } /// @@ -157,12 +160,35 @@ public override readonly string ToString() /// . 
public static PoolMetadata Create(string value) { - return System.Text.Json.JsonSerializer.Deserialize(value, Options); + return JsonSerializer.Deserialize(value, Options); } - internal readonly bool Validate() + internal readonly bool Validate(bool validateEventsVersion) { - return !(string.IsNullOrWhiteSpace(HostName) || string.IsNullOrWhiteSpace(RunnerMD5)); + if (string.IsNullOrWhiteSpace(HostName) || string.IsNullOrWhiteSpace(RunnerMD5)) + { + return false; + } + + return validateEventsVersion + ? !NormalizeForValidation(Events.RunnerEventsMessage.EventsVersion).OrderBy(pair => pair.Key, StringComparer.OrdinalIgnoreCase) + .SequenceEqual(NormalizeForValidation(EventsVersion).OrderBy(pair => pair.Key, StringComparer.OrdinalIgnoreCase)) + : Events.RunnerEventsMessage.EventsVersion.Keys.Order(StringComparer.OrdinalIgnoreCase) + .SequenceEqual(EventsVersion.Keys.Order(StringComparer.OrdinalIgnoreCase), StringComparer.OrdinalIgnoreCase); + } + + static Dictionary NormalizeForValidation(IDictionary value) + { + return value + .OrderBy(pair => pair.Key) + .Select(static pair => new KeyValuePair(pair.Key.ToUpperInvariant(), pair.Value switch + { + JsonElement element when JsonValueKind.Number.Equals(element.ValueKind) => new Version(element.ToString()), + JsonElement element when JsonValueKind.String.Equals(element.ValueKind) => new Version(element.ToString()), + string stringValue => new Version(stringValue), + _ => pair.Value + })) + .ToDictionary(); } } } diff --git a/src/TesApi.Web/PoolScheduler.cs b/src/TesApi.Web/PoolScheduler.cs index c4941c520..5efc0842d 100644 --- a/src/TesApi.Web/PoolScheduler.cs +++ b/src/TesApi.Web/PoolScheduler.cs @@ -24,11 +24,14 @@ namespace TesApi.Web /// Used for requesting termination of the current application during initialization. /// The main TES task database repository implementation. /// The batch scheduler implementation. + /// The batch scheduler implementation. /// The logger instance. 
/// - internal class PoolScheduler(Microsoft.Extensions.Hosting.IHostApplicationLifetime hostApplicationLifetime, IRepository repository, IBatchScheduler batchScheduler, ILogger logger) + internal class PoolScheduler(Microsoft.Extensions.Hosting.IHostApplicationLifetime hostApplicationLifetime, IRepository repository, IBatchScheduler batchScheduler, ITaskScheduler taskScheduler, ILogger logger) : OrchestrateOnBatchSchedulerServiceBase(hostApplicationLifetime, repository, batchScheduler, logger) { + private readonly ITaskScheduler TaskScheduler = taskScheduler; + /// /// Interval between each call to . /// @@ -55,7 +58,7 @@ internal class PoolScheduler(Microsoft.Extensions.Hosting.IHostApplicationLifeti /// protected override void ExecuteSetup(CancellationToken cancellationToken) { - BatchScheduler.LoadExistingPoolsAsync(cancellationToken).Wait(cancellationToken); // Delay starting TaskScheduler until this completes to finish initializing the shared parts of BatchScheduler. + BatchScheduler.LoadExistingPoolsAsync(cancellationToken).Wait(cancellationToken); // Delay starting PoolScheduler until this completes to finish initializing the shared parts of BatchScheduler. } /// @@ -176,7 +179,7 @@ await Parallel.ForEachAsync(states, cancellationToken, async (state, token) => await OrchestrateTesTasksOnBatchAsync( $"NodeState ({poolId})", _ => ValueTask.FromResult(list.Select(t => t.TesTask).ToAsyncEnumerable()), - (tesTasks, token) => BatchScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, list.Select(t => t.State).ToArray(), token), + (tesTasks, token) => TaskScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, list.Select(t => t.State).ToArray(), token), cancellationToken); } else diff --git a/src/TesApi.Web/TaskScheduler.cs b/src/TesApi.Web/TaskScheduler.cs index c581bf586..576f3929b 100644 --- a/src/TesApi.Web/TaskScheduler.cs +++ b/src/TesApi.Web/TaskScheduler.cs @@ -15,6 +15,30 @@ namespace TesApi.Web { + /// + /// An interface for scheduling s. 
+ /// + internal interface ITaskScheduler + { + + /// + /// Schedules a + /// + /// A to schedule on the batch system. + /// A for controlling the lifetime of the asynchronous operation. + /// True to persist the , otherwise False. + Task ProcessQueuedTesTaskAsync(TesTask tesTask, CancellationToken cancellationToken); + + /// + /// Updates s with task-related state + /// + /// s to schedule on the batch system. + /// s corresponding to each . + /// A for controlling the lifetime of the asynchronous operation. + /// True for each corresponding that needs to be persisted. + IAsyncEnumerable> ProcessTesTaskBatchStatesAsync(IEnumerable tesTasks, AzureBatchTaskState[] taskStates, CancellationToken cancellationToken); + } + /// /// A background service that schedules s in the batch system, orchestrates their lifecycle, and updates their state. /// This should only be used as a system-wide singleton service. This class does not support scale-out on multiple machines, @@ -26,7 +50,7 @@ namespace TesApi.Web /// The batch scheduler implementation. /// The logger instance. 
internal class TaskScheduler(RunnerEventsProcessor nodeEventProcessor, Microsoft.Extensions.Hosting.IHostApplicationLifetime hostApplicationLifetime, IRepository repository, IBatchScheduler batchScheduler, ILogger taskSchedulerLogger) - : OrchestrateOnBatchSchedulerServiceBase(hostApplicationLifetime, repository, batchScheduler, taskSchedulerLogger) + : OrchestrateOnBatchSchedulerServiceBase(hostApplicationLifetime, repository, batchScheduler, taskSchedulerLogger), ITaskScheduler { private static readonly TimeSpan blobRunInterval = TimeSpan.FromSeconds(15); internal static readonly TimeSpan BatchRunInterval = TimeSpan.FromSeconds(30); // The very fastest process inside of Azure Batch accessing anything within pools or jobs appears to use a 30 second polling interval @@ -125,7 +149,7 @@ private Task ExecuteCancelledTesTasksOnBatchAsync(CancellationToken cancellation token => OrchestrateTesTasksOnBatchAsync( "Cancelled", query, - (tasks, ct) => BatchScheduler.ProcessTesTaskBatchStatesAsync( + (tasks, ct) => ((ITaskScheduler)this).ProcessTesTaskBatchStatesAsync( tasks, Enumerable.Repeat(new(AzureBatchTaskState.TaskState.CancellationRequested), tasks.Length).ToArray(), ct), @@ -248,7 +272,7 @@ private async ValueTask UpdateTesTasksFromAvailableEventsAsync(IEnumerable<(TesT await OrchestrateTesTasksOnBatchAsync( "NodeEvent", _ => ValueTask.FromResult(eventStates.Select(@event => @event.Task).ToAsyncEnumerable()), - (tesTasks, token) => BatchScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, eventStates.Select(@event => @event.State).ToArray(), token), + (tesTasks, token) => ((ITaskScheduler)this).ProcessTesTaskBatchStatesAsync(tesTasks, eventStates.Select(@event => @event.State).ToArray(), token), cancellationToken, "events"); @@ -268,5 +292,27 @@ await Parallel.ForEachAsync(eventStates.Select(@event => @event.MarkProcessedAsy } }); } + + /// + Task ITaskScheduler.ProcessQueuedTesTaskAsync(TesTask tesTask, CancellationToken cancellationToken) + { + return 
BatchScheduler.ProcessQueuedTesTaskAsync(tesTask, cancellationToken); + } + + /// + IAsyncEnumerable> ITaskScheduler.ProcessTesTaskBatchStatesAsync(IEnumerable tesTasks, AzureBatchTaskState[] taskStates, CancellationToken cancellationToken) + { + ArgumentNullException.ThrowIfNull(tesTasks); + ArgumentNullException.ThrowIfNull(taskStates); + + return taskStates.Zip(tesTasks, (TaskState, TesTask) => (TaskState, TesTask)) + .Select(entry => new RelatedTask(entry.TesTask?.IsActiveState() ?? false // Removes already terminal (and null) TesTasks from being further processed. + ? WrapHandleTesTaskTransitionAsync(entry.TesTask, entry.TaskState, cancellationToken) + : Task.FromResult(false), entry.TesTask)) + .WhenEach(cancellationToken, tesTaskTask => tesTaskTask.Task); + + async Task WrapHandleTesTaskTransitionAsync(TesTask tesTask, AzureBatchTaskState azureBatchTaskState, CancellationToken cancellationToken) + => await BatchScheduler.ProcessTesTaskBatchStateAsync(tesTask, azureBatchTaskState, cancellationToken); + } } } From 3ecca373bfc1fc64d48225ff0db7d75ccb2de331 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Tue, 15 Oct 2024 02:40:37 -0700 Subject: [PATCH 149/202] fix DI --- src/TesApi.Web/AzureBatchTaskState.cs | 8 ++++---- src/TesApi.Web/Startup.cs | 4 +++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/TesApi.Web/AzureBatchTaskState.cs b/src/TesApi.Web/AzureBatchTaskState.cs index 422fc514f..26a769047 100644 --- a/src/TesApi.Web/AzureBatchTaskState.cs +++ b/src/TesApi.Web/AzureBatchTaskState.cs @@ -17,12 +17,12 @@ namespace TesApi.Web /// Failure information. /// . /// . - /// - /// - /// + /// . + /// . + /// . /// . /// . - /// Warning. First item in enumeration is the Warning code, rest of items are additional system log entries. + /// Warning. First item in enumeration is the Warning code, rest of items are related system log entries. 
public record class AzureBatchTaskState( TaskState State, IEnumerable OutputFileLogs = default, diff --git a/src/TesApi.Web/Startup.cs b/src/TesApi.Web/Startup.cs index 8d6d5cda9..40052e806 100644 --- a/src/TesApi.Web/Startup.cs +++ b/src/TesApi.Web/Startup.cs @@ -119,6 +119,8 @@ public void ConfigureServices(IServiceCollection services) .AddSingleton(CreateActionIdentityProvider) .AddSingleton() .AddSingleton() + .AddSingleton() + .AddSingleton(s => s.GetRequiredService()) .AddSingleton() .AddSingleton(s => ActivatorUtilities.CreateInstance(s, TerraOptionsAreConfigured(s))) .AddSingleton(CreateBatchAccountResourceInformation) @@ -299,7 +301,7 @@ static string ParseStorageUri(string uri) // Order is important for hosted services .AddHostedService(sp => (AllowedVmSizesService)sp.GetRequiredService(typeof(IAllowedVmSizesService))) .AddHostedService() - .AddHostedService(); + .AddHostedService(s => s.GetRequiredService()); } catch (Exception exc) { From e35dc8ca4b6afc810973b8a4a66cbd0428a0808c Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Tue, 15 Oct 2024 08:03:36 -0700 Subject: [PATCH 150/202] startup fixes --- src/Tes/Models/TesTaskExtended.cs | 4 +- .../Repository/TesTaskPostgreSqlRepository.cs | 37 ----- ...askPostgreSqlRepositoryIntegrationTests.cs | 2 +- .../TestServices/TestServiceProvider.cs | 24 ++- src/TesApi.Web/BatchPool.cs | 4 + src/TesApi.Web/BatchScheduler.cs | 33 ++-- src/TesApi.Web/Controllers/TaskServiceApi.cs | 5 +- src/TesApi.Web/IBatchPool.cs | 6 + src/TesApi.Web/IBatchScheduler.cs | 2 +- .../OrchestrateOnBatchSchedulerServiceBase.cs | 8 +- src/TesApi.Web/TaskScheduler.cs | 146 +++++++++++++----- 11 files changed, 171 insertions(+), 100 deletions(-) diff --git a/src/Tes/Models/TesTaskExtended.cs b/src/Tes/Models/TesTaskExtended.cs index b8e112d5d..bac6e43a0 100644 --- a/src/Tes/Models/TesTaskExtended.cs +++ b/src/Tes/Models/TesTaskExtended.cs @@ -102,9 +102,9 @@ public partial class TesTask : RepositoryItem /// True if task should be kept in 
the cache. /// /// - public bool IsActiveState(bool includePreempted = false) // TODO: consider using TesResources.BackendParameters to signal whether PREEMPTEDEnum is considered a terminal state + public bool IsActiveState(bool preemptedIsTerminal = false) // TODO: consider using TesResources.BackendParameters to signal whether PREEMPTEDEnum is considered a terminal state { - return !(includePreempted ? TerminalStatesWithPreempted : TerminalStates).Contains(this.State); + return !(preemptedIsTerminal ? TerminalStatesWithPreempted : TerminalStates).Contains(this.State); } /// diff --git a/src/Tes/Repository/TesTaskPostgreSqlRepository.cs b/src/Tes/Repository/TesTaskPostgreSqlRepository.cs index eefe22731..7c5f2815a 100644 --- a/src/Tes/Repository/TesTaskPostgreSqlRepository.cs +++ b/src/Tes/Repository/TesTaskPostgreSqlRepository.cs @@ -102,7 +102,6 @@ public TesTaskPostgreSqlRepository(IOptions options, Microsof { var dataSource = NpgsqlDataSourceFunc(ConnectionStringUtility.GetPostgresConnectionString(options)); // The datasource itself must be essentially a singleton. 
CreateDbContext = Initialize(() => new TesDbContext(dataSource, NpgsqlDbContextOptionsBuilder)); - WarmCacheAsync(CancellationToken.None).GetAwaiter().GetResult(); } /// @@ -122,42 +121,6 @@ private static Func Initialize(Func createDbContext) return createDbContext; } - private async Task WarmCacheAsync(CancellationToken cancellationToken) - { - if (Cache is null) - { - Logger?.LogWarning("Cache is null for TesTaskPostgreSqlRepository; no caching will be used."); - return; - } - - var sw = Stopwatch.StartNew(); - Logger?.LogInformation("Warming cache..."); - - // Don't allow the state of the system to change until the cache and system are consistent; - // this is a fast PostgreSQL query even for 1 million items - await Policy - .Handle() - .WaitAndRetryAsync(3, - retryAttempt => - { - Logger?.LogWarning("Warming cache retry attempt #{RetryAttempt}", retryAttempt); - return TimeSpan.FromSeconds(10); - }, - (ex, ts) => - { - Logger?.LogCritical(ex, "Couldn't warm cache, is the database online?"); - }) - .ExecuteAsync(async ct => - { - var activeTasksCount = (await InternalGetItemsAsync( - ct, - orderBy: q => q.OrderBy(t => t.Json.CreationTime), - efPredicates: Enumerable.Empty>>().Append(task => !TesTask.TerminalStates.Contains(task.State)))) - .Count(); - Logger?.LogInformation("Cache warmed successfully in {TotalSeconds:n3} seconds. 
Added {TasksAddedCount:n0} items to the cache.", sw.Elapsed.TotalSeconds, activeTasksCount); - }, cancellationToken); - } - /// public async Task TryGetItemAsync(string id, CancellationToken cancellationToken, Action onSuccess = null) diff --git a/src/TesApi.Tests/Repository/TesTaskPostgreSqlRepositoryIntegrationTests.cs b/src/TesApi.Tests/Repository/TesTaskPostgreSqlRepositoryIntegrationTests.cs index 2d3d64721..6251be0c4 100644 --- a/src/TesApi.Tests/Repository/TesTaskPostgreSqlRepositoryIntegrationTests.cs +++ b/src/TesApi.Tests/Repository/TesTaskPostgreSqlRepositoryIntegrationTests.cs @@ -139,7 +139,7 @@ public async Task ListTasksWithPagingRetrievesAllTesTasks() await Parallel.ForEachAsync(items, CancellationToken.None, async (item, token) => await repository.CreateItemAsync(item, token)); } - var controller = new TaskServiceApiController(repository, null, null, null); + var controller = new TaskServiceApiController(repository, null, null, null, null); string pageToken = null; var tesTaskIds = new HashSet(); diff --git a/src/TesApi.Tests/TestServices/TestServiceProvider.cs b/src/TesApi.Tests/TestServices/TestServiceProvider.cs index 972db8a7f..dc8f808f2 100644 --- a/src/TesApi.Tests/TestServices/TestServiceProvider.cs +++ b/src/TesApi.Tests/TestServices/TestServiceProvider.cs @@ -4,6 +4,7 @@ using System; using System.Collections.Generic; using System.Linq; +using System.Threading; using System.Threading.Tasks; using CommonUtilities; using CommonUtilities.Options; @@ -52,6 +53,7 @@ internal TestServiceProvider( var azureCloudConfig = ExpensiveObjectTestUtility.AzureCloudConfig; provider = new ServiceCollection() .AddSingleton(_ => new TesServiceInfo { CreatedAt = DateTimeOffset.UtcNow, Environment = "unittest", Id = "unit-test-id", Organization = new() { Name = "unit-test-org", Url = "http://localhost/" }, Storage = [], UpdatedAt = DateTimeOffset.UtcNow }) + .AddSingleton(_ => new BatchAccountResourceInformation("batchAccount", "resourceGroupName", 
"subscriptionId", "regionName", "https://batchAccount.regionName.batchSuffix")) .AddSingleton(azureCloudConfig) .AddSingleton(azureCloudConfig.AzureEnvironmentConfig) .AddSingleton() @@ -74,19 +76,21 @@ internal TestServiceProvider( .AddTransient>(_ => NullLogger.Instance) .IfThenElse(mockStorageAccessProvider, s => s, s => s.AddTransient>(_ => NullLogger.Instance)) .IfThenElse(batchSkuInformationProvider is null, - s => s.AddSingleton(sp => ActivatorUtilities.CreateInstance(sp)) - .AddSingleton(sp => new PriceApiBatchSkuInformationProvider(sp.GetRequiredService(), azureCloudConfig, sp.GetRequiredService>())), + s => s.AddSingleton(sp => new PriceApiBatchSkuInformationProvider(sp.GetRequiredService(), azureCloudConfig, sp.GetRequiredService>())), s => s.AddSingleton(_ => GetBatchSkuInformationProvider(batchSkuInformationProvider).Object)) .AddSingleton(_ => GetBatchQuotaProvider(batchQuotaProvider).Object) .AddTransient>(_ => NullLogger.Instance) + .AddTransient>(_ => NullLogger.Instance) .AddTransient>(_ => NullLogger.Instance) .AddTransient>(_ => NullLogger.Instance) .AddTransient>(_ => NullLogger.Instance) .AddTransient>(_ => NullLogger.Instance) + .AddTransient>(_ => NullLogger.Instance) .AddTransient>(_ => NullLogger.Instance) .AddTransient>(_ => NullLogger.Instance) .AddTransient>(_ => NullLogger.Instance) .AddTransient>(_ => NullLogger.Instance) + .AddTransient>(_ => NullLogger.Instance) .AddSingleton() .AddTransient() .AddSingleton() @@ -99,7 +103,9 @@ internal TestServiceProvider( .AddSingleton() .AddSingleton() .AddSingleton(GetBatchPoolManager(batchPoolManager).Object) + .AddSingleton() .IfThenElse(additionalActions is null, s => { }, s => additionalActions(s)) + .AddSingleton() .BuildServiceProvider(); IOptions BindHelper(string key) where TOption : class, new() @@ -228,6 +234,20 @@ private Mock GetStorageAccessProvider(Action throw new NotImplementedException(); + + public CancellationToken ApplicationStopped => throw new NotImplementedException(); + + 
public CancellationToken ApplicationStopping => throw new NotImplementedException(); + + public void StopApplication() + { + throw new NotImplementedException(); + } + } + public void Dispose() => (provider as IDisposable)?.Dispose(); diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index 89ca4e401..5aa84b3d0 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -2,6 +2,7 @@ // Licensed under the MIT License. using System; +using System.Collections.Concurrent; using System.Collections.Generic; using System.Linq; using System.Text.Json; @@ -491,6 +492,9 @@ internal enum ServiceKind /// public Queue ResizeErrors { get; } = new(); + /// + public ConcurrentDictionary AssociatedTesTasks { get; } = []; + /// public async ValueTask CanBeDeletedAsync(CancellationToken cancellationToken = default) { diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index a7da54914..778c82d78 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -450,8 +450,16 @@ internal static string CreateWgetDownloadCommand(Uri urlToDownload, string local private IAsyncEnumerable GetCloudPools(CancellationToken cancellationToken) => azureProxy.GetActivePoolsAsync(batchPrefix); + private Lazy _loadExistingPools = null; + /// - public async Task LoadExistingPoolsAsync(CancellationToken cancellationToken) + public Task LoadExistingPoolsAsync(CancellationToken cancellationToken) + { + _ = Interlocked.CompareExchange(ref _loadExistingPools, new(() => LoadExistingPoolsImplAsync(cancellationToken)), null); + return _loadExistingPools.Value; + } + + private async Task LoadExistingPoolsImplAsync(CancellationToken cancellationToken) { await foreach (var cloudPool in GetCloudPools(cancellationToken).WithCancellation(cancellationToken)) { @@ -656,7 +664,7 @@ private async ValueTask AttachQueuedTesTaskToBatchPoolAsync(string poolKey if (pool is null) { - TaskCompletionSource poolCompletion = new(); // This 
provides the poolId of the pool provided for the task + TaskCompletionSource poolCompletion = new(); // This provides the poolId of the pool provided for the task AddTValueToCollectorQueue( key: poolKey, value: new PendingPoolRequest(poolKey, virtualMachineInfo, poolCompletion), @@ -665,12 +673,7 @@ private async ValueTask AttachQueuedTesTaskToBatchPoolAsync(string poolKey groupGatherWindow: QueuedTesTaskPoolGroupGatherWindow, maxCount: int.MaxValue); - pool = batchPools.GetPoolOrDefault(await poolCompletion.Task); // This ensures that the pool is managed by this BatchScheduler - - if (pool is null) - { - throw new System.Diagnostics.UnreachableException("Pool should have been obtained by this point."); - } + pool = await poolCompletion.Task; } var tesTaskLog = tesTask.AddTesTaskLog(); @@ -678,6 +681,7 @@ private async ValueTask AttachQueuedTesTaskToBatchPoolAsync(string poolKey var cloudTaskId = $"{tesTask.Id}-{tesTask.Logs.Count}"; tesTask.PoolId = pool.PoolId; var cloudTask = await ConvertTesTaskToBatchTaskUsingRunnerAsync(cloudTaskId, tesTask, virtualMachineInfo.Identities.Last(), virtualMachineInfo.VM.VmFamily, cancellationToken); + _ = pool.AssociatedTesTasks.AddOrUpdate(tesTask.Id, key => cloudTask.Id, (key, value) => cloudTask.Id); logger.LogInformation(@"Creating batch task for TES task {TesTaskId}. 
Using VM size {VmSize}.", tesTask.Id, virtualMachineInfo.VM.VmSize); @@ -820,7 +824,7 @@ private static async Task PerformTaskAsync(Func me if (taskCompletions.Any()) { - taskCompletions.ForEach(completion => _ = completion.TrySetException(new AggregateException(Enumerable.Empty().Append(exception)))); + taskCompletions.ForEach(completion => _ = completion.TrySetException(exception)); } else { @@ -847,7 +851,7 @@ private static async Task PerformTaskOfTAsync(Func _ = completion.TrySetException(new AggregateException(Enumerable.Empty().Append(exception)))); + taskCompletions.ForEach(completion => _ = completion.TrySetException(exception)); } else { @@ -949,7 +953,7 @@ bool TryRemovePool(string key, out IEnumerable result) } } - // Create batch pools + // Obtain batch pools while (_queuedTesTaskPendingPools.TryDequeue(out var pool)) { logger.LogDebug(@"Creating pool for {PoolKey}.", pool.PoolKey); @@ -966,8 +970,7 @@ bool TryRemovePool(string key, out IEnumerable result) initialTarget: pool.InitialTarget, nodeInfo: (useGen2 ?? false) ? 
gen2BatchNodeInfo : gen1BatchNodeInfo, cancellationToken: ct), - cancellationToken: token)) - .PoolId, + cancellationToken: token)), taskCompletions: pool.TaskCompletions, cancellationToken: cancellationToken)); } @@ -1807,8 +1810,8 @@ public CombinedBatchTaskInfo(AzureBatchTaskState state, string alternateSystemLo // Records managing the processing of TesTasks in Queued status private record struct PendingCloudTask(CloudTask CloudTask, TaskCompletionSource TaskCompletion); - private record struct PendingPoolRequest(string PoolKey, VirtualMachineInformationWithDataDisks VirtualMachineInfo, TaskCompletionSource TaskCompletion); - private record struct PendingPool(string PoolKey, VirtualMachineInformationWithDataDisks VirtualMachineInfo, int InitialTarget, IEnumerable> TaskCompletions); + private record struct PendingPoolRequest(string PoolKey, VirtualMachineInformationWithDataDisks VirtualMachineInfo, TaskCompletionSource TaskCompletion); + private record struct PendingPool(string PoolKey, VirtualMachineInformationWithDataDisks VirtualMachineInfo, int InitialTarget, IEnumerable> TaskCompletions); private record struct ImmutableQueueWithTimer(Timer Timer, ImmutableQueue Queue); internal record class VirtualMachineInformationWithDataDisks(VirtualMachineInformation VM, IList DataDisks) diff --git a/src/TesApi.Web/Controllers/TaskServiceApi.cs b/src/TesApi.Web/Controllers/TaskServiceApi.cs index 08a6ccc6f..c4090cb6a 100644 --- a/src/TesApi.Web/Controllers/TaskServiceApi.cs +++ b/src/TesApi.Web/Controllers/TaskServiceApi.cs @@ -40,14 +40,16 @@ namespace TesApi.Controllers /// /// The main database repository /// The storage access provider + /// The task scheduler /// The logger instance /// The GA4GH TES service information - public class TaskServiceApiController(IRepository repository, IStorageAccessProvider storageAccessProvider, ILogger logger, TesServiceInfo serviceInfo) + public class TaskServiceApiController(IRepository repository, IStorageAccessProvider 
storageAccessProvider, ITaskScheduler taskScheduler, ILogger logger, TesServiceInfo serviceInfo) : ControllerBase { //private const string rootExecutionPath = "/cromwell-executions"; private readonly IRepository repository = repository; private readonly IStorageAccessProvider storageAccessProvider = storageAccessProvider; + private readonly ITaskScheduler taskScheduler = taskScheduler; private readonly ILogger logger = logger; private readonly TesServiceInfo serviceInfo = serviceInfo; @@ -267,6 +269,7 @@ public virtual async Task CreateTaskAsync([FromBody] TesTask tesT logger.LogDebug("Creating task with id {TesTask} state {TesTaskState}", tesTask.Id, tesTask.State); await repository.CreateItemAsync(tesTask, cancellationToken); + await taskScheduler.ProcessQueuedTesTaskAsync(tesTask, cancellationToken); return StatusCode(200, new TesCreateTaskResponse { Id = tesTask.Id }); } diff --git a/src/TesApi.Web/IBatchPool.cs b/src/TesApi.Web/IBatchPool.cs index 91eca1c0d..6dd65c4b7 100644 --- a/src/TesApi.Web/IBatchPool.cs +++ b/src/TesApi.Web/IBatchPool.cs @@ -2,6 +2,7 @@ // Licensed under the MIT License. using System; +using System.Collections.Concurrent; using System.Collections.Generic; using System.Threading; using System.Threading.Tasks; @@ -89,6 +90,11 @@ public interface IBatchPool /// ValueTask GetAllocationStateTransitionTimeAsync(CancellationToken cancellationToken = default); + /// + /// TesTasks associated with pool. + /// + ConcurrentDictionary AssociatedTesTasks { get; } + /// /// A with a compute node id. /// diff --git a/src/TesApi.Web/IBatchScheduler.cs b/src/TesApi.Web/IBatchScheduler.cs index 56e5f05fc..923b9adc1 100644 --- a/src/TesApi.Web/IBatchScheduler.cs +++ b/src/TesApi.Web/IBatchScheduler.cs @@ -84,7 +84,7 @@ public interface IBatchScheduler bool RemovePoolFromList(IBatchPool pool); /// - /// Flushes empty pools to accomodate pool quota limits. + /// Flushes empty pools to accommodate pool quota limits. 
/// /// Pool Ids of pools connected to active TES Tasks. Used to prevent accidentally removing active pools. /// A for controlling the lifetime of the asynchronous operation. diff --git a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs index 1e11d238e..b68cda0ff 100644 --- a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs +++ b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs @@ -307,7 +307,13 @@ static bool IsExceptionHttpConflictWhereTaskIsComplete(Exception exc) /// Tag to indicate the underlying unit quantity of items processed in log messages. /// True to process even if there are no tasks processed. /// A that represents this method's operations. - protected async ValueTask OrchestrateTesTasksOnBatchAsync(string pollName, Func>> tesTaskGetter, Func>> tesTaskProcessor, CancellationToken cancellationToken, string unitsLabel = "tasks", bool needPoolFlush = false) + protected async ValueTask OrchestrateTesTasksOnBatchAsync( + string pollName, + Func>> tesTaskGetter, + Func>> tesTaskProcessor, + CancellationToken cancellationToken, + string unitsLabel = "tasks", + bool needPoolFlush = false) { var tesTasks = await (await tesTaskGetter(cancellationToken)).ToArrayAsync(cancellationToken); var noTasks = tesTasks.All(task => task is null); diff --git a/src/TesApi.Web/TaskScheduler.cs b/src/TesApi.Web/TaskScheduler.cs index 576f3929b..4520b3609 100644 --- a/src/TesApi.Web/TaskScheduler.cs +++ b/src/TesApi.Web/TaskScheduler.cs @@ -6,6 +6,7 @@ using System.Collections.Generic; using System.Linq; using System.Threading; +using System.Threading.Channels; using System.Threading.Tasks; using Microsoft.Extensions.Logging; using Tes.Models; @@ -18,7 +19,7 @@ namespace TesApi.Web /// /// An interface for scheduling s. /// - internal interface ITaskScheduler + public interface ITaskScheduler { /// @@ -26,8 +27,7 @@ internal interface ITaskScheduler /// /// A to schedule on the batch system. 
/// A for controlling the lifetime of the asynchronous operation. - /// True to persist the , otherwise False. - Task ProcessQueuedTesTaskAsync(TesTask tesTask, CancellationToken cancellationToken); + Task ProcessQueuedTesTaskAsync(TesTask tesTask, CancellationToken cancellationToken); /// /// Updates s with task-related state @@ -50,7 +50,8 @@ internal interface ITaskScheduler /// The batch scheduler implementation. /// The logger instance. internal class TaskScheduler(RunnerEventsProcessor nodeEventProcessor, Microsoft.Extensions.Hosting.IHostApplicationLifetime hostApplicationLifetime, IRepository repository, IBatchScheduler batchScheduler, ILogger taskSchedulerLogger) - : OrchestrateOnBatchSchedulerServiceBase(hostApplicationLifetime, repository, batchScheduler, taskSchedulerLogger), ITaskScheduler + : OrchestrateOnBatchSchedulerServiceBase(hostApplicationLifetime, repository, batchScheduler, taskSchedulerLogger) + , ITaskScheduler { private static readonly TimeSpan blobRunInterval = TimeSpan.FromSeconds(15); internal static readonly TimeSpan BatchRunInterval = TimeSpan.FromSeconds(30); // The very fastest process inside of Azure Batch accessing anything within pools or jobs appears to use a 30 second polling interval @@ -58,6 +59,16 @@ internal class TaskScheduler(RunnerEventsProcessor nodeEventProcessor, Microsoft private static readonly TimeSpan longBackgroundRunInterval = TimeSpan.FromSeconds(1); private readonly RunnerEventsProcessor nodeEventProcessor = nodeEventProcessor; + /// + /// Checks to see if the hosted service is running. + /// + /// False if the service hasn't started up yet, True if it has started, throws TaskCanceledException if service is/has shutdown. + private bool IsRunning => stoppingToken is not null && (stoppingToken.Value.IsCancellationRequested ? throw new TaskCanceledException() : true); + + private CancellationToken? 
stoppingToken = null; + private readonly ConcurrentQueue queuedTesTasks = []; + private readonly ConcurrentQueue<(TesTask[] TesTasks, AzureBatchTaskState[] TaskStates, ChannelWriter> Channel)> tesTaskBatchStates = []; + /// protected override async ValueTask ExecuteSetupAsync(CancellationToken cancellationToken) { @@ -65,48 +76,87 @@ protected override async ValueTask ExecuteSetupAsync(CancellationToken cancellat { // Delay "starting" TaskScheduler until this completes to finish initializing BatchScheduler. await BatchScheduler.UploadTaskRunnerIfNeededAsync(cancellationToken); + // Ensure BatchScheduler has loaded existing pools before "starting". + await BatchScheduler.LoadExistingPoolsAsync(cancellationToken); } catch (Exception exc) { Logger.LogError(exc, @"Checking/storing the node task runner binary failed with {Message}", exc.Message); throw; } + + foreach (var tesTask in + (await Repository.GetItemsAsync( + predicate: t => t.IsActiveState(false), // TODO: preemptedIsTerminal + cancellationToken: cancellationToken)) + .OrderBy(t => t.CreationTime)) + { + try + { + if (TesState.QUEUED.Equals(tesTask.State) && string.IsNullOrWhiteSpace(tesTask.PoolId)) + { + queuedTesTasks.Enqueue(tesTask); + } + else + { + var pool = BatchScheduler.GetPools().SingleOrDefault(pool => tesTask.PoolId.Equals(pool.PoolId, StringComparison.OrdinalIgnoreCase)); + + if (pool is null) + { + queuedTesTasks.Enqueue(tesTask); // TODO: is there a better way to treat tasks that are not "queued" that are also not associated with any known pool? 
+ } + else + { + _ = pool.AssociatedTesTasks.AddOrUpdate(tesTask.Id, key => null, (key, value) => value); + } + } + } + catch (Exception ex) + { + await ProcessOrchestratedTesTaskAsync("Initialization", new(Task.FromException(ex), tesTask), cancellationToken); + } + } } /// protected override async ValueTask ExecuteCoreAsync(CancellationToken cancellationToken) { - await Task.WhenAll( - ExecuteShortBackgroundTasksAsync(cancellationToken), - ExecuteLongBackgroundTasksAsync(cancellationToken), - ExecuteCancelledTesTasksOnBatchAsync(cancellationToken), - ExecuteQueuedTesTasksOnBatchAsync(cancellationToken), - ExecuteUpdateTesTaskFromEventBlobAsync(cancellationToken)); + stoppingToken = cancellationToken; + List queuedTasks = []; + + while (queuedTesTasks.TryDequeue(out var tesTask)) + { + queuedTasks.Add(((ITaskScheduler)this).ProcessQueuedTesTaskAsync(tesTask, cancellationToken)); + } + + while (tesTaskBatchStates.TryDequeue(out var result)) + { + queuedTasks.Add(ProcessQueuedTesTaskStatesRequestAsync(result.TesTasks, result.TaskStates, result.Channel, cancellationToken)); + } + + queuedTasks.Add(ExecuteShortBackgroundTasksAsync(cancellationToken)); + queuedTasks.Add(ExecuteLongBackgroundTasksAsync(cancellationToken)); + queuedTasks.Add(ExecuteCancelledTesTasksOnBatchAsync(cancellationToken)); + queuedTasks.Add(ExecuteUpdateTesTaskFromEventBlobAsync(cancellationToken)); + + await Task.WhenAll(queuedTasks); } - /// - /// Retrieves all queued TES tasks from the database, performs an action in the batch system, and updates the resultant state - /// - /// - private Task ExecuteQueuedTesTasksOnBatchAsync(CancellationToken cancellationToken) + private async Task ProcessQueuedTesTaskStatesRequestAsync(TesTask[] tesTasks, AzureBatchTaskState[] taskStates, ChannelWriter> channel, CancellationToken cancellationToken) { - Func>> query = new( - async token => (await Repository.GetItemsAsync( - predicate: t => t.State == TesState.QUEUED, - cancellationToken: token)) - .OrderBy(t 
=> t.CreationTime)); - - return ExecuteActionOnIntervalAsync(BatchRunInterval, - async cancellation => + try + { + await foreach (var relatedTask in ((ITaskScheduler)this).ProcessTesTaskBatchStatesAsync(tesTasks, taskStates, cancellationToken)) { - await Parallel.ForEachAsync( - (await query(cancellation)) - .Select(task => new RelatedTask(BatchScheduler.ProcessQueuedTesTaskAsync(task, cancellation), task)) - .WhenEach(cancellation, task => task.Task), - cancellation, - (task, token) => ProcessOrchestratedTesTaskAsync("Queued", task, token)); - }, - cancellationToken); + await channel.WriteAsync(relatedTask, cancellationToken); + } + + channel.Complete(); + } + catch (Exception ex) + { + channel.Complete(ex); + } } /// @@ -294,9 +344,16 @@ await Parallel.ForEachAsync(eventStates.Select(@event => @event.MarkProcessedAsy } /// - Task ITaskScheduler.ProcessQueuedTesTaskAsync(TesTask tesTask, CancellationToken cancellationToken) + async Task ITaskScheduler.ProcessQueuedTesTaskAsync(TesTask tesTask, CancellationToken cancellationToken) { - return BatchScheduler.ProcessQueuedTesTaskAsync(tesTask, cancellationToken); + if (IsRunning) + { + await ProcessOrchestratedTesTaskAsync("Queued", new(BatchScheduler.ProcessQueuedTesTaskAsync(tesTask, cancellationToken), tesTask), cancellationToken); + } + else + { + queuedTesTasks.Enqueue(tesTask); + } } /// @@ -305,14 +362,23 @@ IAsyncEnumerable> ITaskScheduler.ProcessTesTaskBatchS ArgumentNullException.ThrowIfNull(tesTasks); ArgumentNullException.ThrowIfNull(taskStates); - return taskStates.Zip(tesTasks, (TaskState, TesTask) => (TaskState, TesTask)) - .Select(entry => new RelatedTask(entry.TesTask?.IsActiveState() ?? false // Removes already terminal (and null) TesTasks from being further processed. - ? 
WrapHandleTesTaskTransitionAsync(entry.TesTask, entry.TaskState, cancellationToken) - : Task.FromResult(false), entry.TesTask)) - .WhenEach(cancellationToken, tesTaskTask => tesTaskTask.Task); + if (IsRunning) + { + return taskStates.Zip(tesTasks, (TaskState, TesTask) => (TaskState, TesTask)) + .Select(entry => new RelatedTask(entry.TesTask?.IsActiveState() ?? false // Removes already terminal (and null) TesTasks from being further processed. + ? WrapHandleTesTaskTransitionAsync(entry.TesTask, entry.TaskState, cancellationToken) + : Task.FromResult(false), entry.TesTask)) + .WhenEach(cancellationToken, tesTaskTask => tesTaskTask.Task); - async Task WrapHandleTesTaskTransitionAsync(TesTask tesTask, AzureBatchTaskState azureBatchTaskState, CancellationToken cancellationToken) - => await BatchScheduler.ProcessTesTaskBatchStateAsync(tesTask, azureBatchTaskState, cancellationToken); + async Task WrapHandleTesTaskTransitionAsync(TesTask tesTask, AzureBatchTaskState azureBatchTaskState, CancellationToken cancellationToken) + => await BatchScheduler.ProcessTesTaskBatchStateAsync(tesTask, azureBatchTaskState, cancellationToken); + } + else + { + var channel = Channel.CreateBounded>(new BoundedChannelOptions(taskStates.Length) { SingleReader = true, SingleWriter = true }); + tesTaskBatchStates.Enqueue((tesTasks.ToArray(), taskStates, channel.Writer)); + return channel.Reader.ReadAllAsync(cancellationToken); + } } } } From 7119891ddbe079e8c50c01059d332be4fec5e396 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Tue, 15 Oct 2024 16:32:51 -0700 Subject: [PATCH 151/202] temp logging --- src/Tes/Tes.csproj | 1 - src/TesApi.Web/TaskScheduler.cs | 9 ++++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/Tes/Tes.csproj b/src/Tes/Tes.csproj index a826d23e5..df328d9f9 100644 --- a/src/Tes/Tes.csproj +++ b/src/Tes/Tes.csproj @@ -17,7 +17,6 @@ all runtime; build; native; contentfiles; analyzers; buildtransitive - diff --git a/src/TesApi.Web/TaskScheduler.cs 
b/src/TesApi.Web/TaskScheduler.cs index 4520b3609..d67678bac 100644 --- a/src/TesApi.Web/TaskScheduler.cs +++ b/src/TesApi.Web/TaskScheduler.cs @@ -77,7 +77,7 @@ protected override async ValueTask ExecuteSetupAsync(CancellationToken cancellat // Delay "starting" TaskScheduler until this completes to finish initializing BatchScheduler. await BatchScheduler.UploadTaskRunnerIfNeededAsync(cancellationToken); // Ensure BatchScheduler has loaded existing pools before "starting". - await BatchScheduler.LoadExistingPoolsAsync(cancellationToken); + //await BatchScheduler.LoadExistingPoolsAsync(cancellationToken); } catch (Exception exc) { @@ -85,6 +85,8 @@ protected override async ValueTask ExecuteSetupAsync(CancellationToken cancellat throw; } + Logger.LogDebug(@"Querying active tasks"); + foreach (var tesTask in (await Repository.GetItemsAsync( predicate: t => t.IsActiveState(false), // TODO: preemptedIsTerminal @@ -95,6 +97,7 @@ protected override async ValueTask ExecuteSetupAsync(CancellationToken cancellat { if (TesState.QUEUED.Equals(tesTask.State) && string.IsNullOrWhiteSpace(tesTask.PoolId)) { + Logger.LogDebug(@"Adding queued task from repository"); queuedTesTasks.Enqueue(tesTask); } else @@ -103,10 +106,12 @@ protected override async ValueTask ExecuteSetupAsync(CancellationToken cancellat if (pool is null) { + Logger.LogDebug(@"Adding task w/o pool id from repository"); queuedTesTasks.Enqueue(tesTask); // TODO: is there a better way to treat tasks that are not "queued" that are also not associated with any known pool? 
} else { + Logger.LogDebug(@"Adding task to pool w/o cloudtask"); _ = pool.AssociatedTesTasks.AddOrUpdate(tesTask.Id, key => null, (key, value) => value); } } @@ -139,6 +144,8 @@ protected override async ValueTask ExecuteCoreAsync(CancellationToken cancellati queuedTasks.Add(ExecuteCancelledTesTasksOnBatchAsync(cancellationToken)); queuedTasks.Add(ExecuteUpdateTesTaskFromEventBlobAsync(cancellationToken)); + Logger.LogDebug(@"Task load: {TaskCount}", queuedTasks.Count); + await Task.WhenAll(queuedTasks); } From 007df4d78614fa200621f5093f856ec5780b85a1 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 17 Oct 2024 12:58:31 -0700 Subject: [PATCH 152/202] more temp logging --- src/Tes/Repository/TesTaskPostgreSqlRepository.cs | 3 +-- src/TesApi.Web/BatchScheduler.cs | 4 ++++ src/TesApi.Web/TaskScheduler.cs | 2 ++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/Tes/Repository/TesTaskPostgreSqlRepository.cs b/src/Tes/Repository/TesTaskPostgreSqlRepository.cs index 7c5f2815a..0be0c82e1 100644 --- a/src/Tes/Repository/TesTaskPostgreSqlRepository.cs +++ b/src/Tes/Repository/TesTaskPostgreSqlRepository.cs @@ -5,7 +5,6 @@ namespace Tes.Repository { using System; using System.Collections.Generic; - using System.Diagnostics; using System.Linq; using System.Linq.Expressions; using System.Reflection; @@ -18,7 +17,6 @@ namespace Tes.Repository using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using Npgsql; - using Polly; using Tes.Models; using Tes.Utilities; @@ -140,6 +138,7 @@ public async Task TryGetItemAsync(string id, CancellationToken cancellatio /// public async Task> GetItemsAsync(Expression> predicate, CancellationToken cancellationToken) { + Logger.LogWarning(@"TesTaskPostgreSqlRepository::GetItemsAsync called"); // TODO: remove this log return (await InternalGetItemsAsync(cancellationToken, efPredicates: [predicate])).Select(t => t.TesTask); } diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs 
index 778c82d78..37fc54e81 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -479,10 +479,14 @@ private async Task LoadExistingPoolsImplAsync(CancellationToken cancellationToke public async Task UploadTaskRunnerIfNeededAsync(CancellationToken cancellationToken) { var blobUri = await storageAccessProvider.GetInternalTesBlobUrlAsync(NodeTaskRunnerFilename, storageAccessProvider.BlobPermissionsWithWrite, cancellationToken); + logger.LogDebug(@"NodeTaskRunner Uri: {NodeTaskRunnerUri}", new BlobUriBuilder(blobUri) { Sas = null }.ToUri().AbsoluteUri); var blobProperties = await azureProxy.GetBlobPropertiesAsync(blobUri, cancellationToken); + logger.LogDebug(@"NodeTaskRunner MD5: {NodeTaskRunnerMD5}", Convert.ToBase64String(blobProperties?.ContentHash ?? [])); if (!runnerMD5.Equals(Convert.ToBase64String(blobProperties?.ContentHash ?? []), StringComparison.OrdinalIgnoreCase)) { + logger.LogInformation(@"Uploading NodeTaskRunner"); await azureProxy.UploadBlobFromFileAsync(blobUri, $"scripts/{NodeTaskRunnerFilename}", cancellationToken); + logger.LogInformation(@"Uploaded NodeTaskRunner"); } } diff --git a/src/TesApi.Web/TaskScheduler.cs b/src/TesApi.Web/TaskScheduler.cs index d67678bac..b56eb34a7 100644 --- a/src/TesApi.Web/TaskScheduler.cs +++ b/src/TesApi.Web/TaskScheduler.cs @@ -121,6 +121,8 @@ protected override async ValueTask ExecuteSetupAsync(CancellationToken cancellat await ProcessOrchestratedTesTaskAsync("Initialization", new(Task.FromException(ex), tesTask), cancellationToken); } } + + Logger.LogDebug(@"Active tasks processed"); } /// From 4005a23ad0c90afa95b2b4d500845b53d0a8b721 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 17 Oct 2024 14:56:41 -0700 Subject: [PATCH 153/202] Repository retry logging --- src/TesApi.Web/RepositoryRetryHandler.cs | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/TesApi.Web/RepositoryRetryHandler.cs b/src/TesApi.Web/RepositoryRetryHandler.cs index 
c14c38eb5..8018d826f 100644 --- a/src/TesApi.Web/RepositoryRetryHandler.cs +++ b/src/TesApi.Web/RepositoryRetryHandler.cs @@ -9,7 +9,6 @@ using CommonUtilities.Options; using Microsoft.Extensions.Options; using Polly; -using Polly.Retry; using Tes.Repository; namespace TesApi.Web @@ -21,23 +20,23 @@ namespace TesApi.Web public sealed class RepositoryRetryHandler : IRepository where T : RepositoryItem { private readonly IRepository _repository; - private readonly AsyncRetryPolicy _asyncRetryPolicy; + private readonly IAsyncPolicy _asyncRetryPolicy; /// /// Constructor for . /// /// The to wrap. /// The to use. Note that we will quadruple the max retry count set in options. - public RepositoryRetryHandler(IRepository repository, IOptions retryPolicyOptions) + /// An instance used to perform logging. + public RepositoryRetryHandler(IRepository repository, IOptions retryPolicyOptions, Microsoft.Extensions.Logging.ILogger> logger) { ArgumentNullException.ThrowIfNull(repository); ArgumentNullException.ThrowIfNull(retryPolicyOptions); - _asyncRetryPolicy = Policy - .Handle() - .WaitAndRetryAsync(retryPolicyOptions.Value.MaxRetryCount * 4, - (attempt) => TimeSpan.FromSeconds(Math.Pow(retryPolicyOptions.Value.ExponentialBackOffExponent, - attempt))); + _asyncRetryPolicy = new CommonUtilities.RetryPolicyBuilder(retryPolicyOptions) + .DefaultRetryPolicyBuilder() + .SetOnRetryBehavior(logger) + .AsyncBuildPolicy(); _repository = repository; } From 244e0d88ee330437e51a11572816a6f5ebc7b01a Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 17 Oct 2024 16:52:43 -0700 Subject: [PATCH 154/202] Fix repository retry logging --- src/TesApi.Web/RepositoryRetryHandler.cs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/TesApi.Web/RepositoryRetryHandler.cs b/src/TesApi.Web/RepositoryRetryHandler.cs index 8018d826f..c9dc9303b 100644 --- a/src/TesApi.Web/RepositoryRetryHandler.cs +++ b/src/TesApi.Web/RepositoryRetryHandler.cs @@ -6,9 +6,9 @@ 
using System.Linq.Expressions; using System.Threading; using System.Threading.Tasks; +using CommonUtilities; using CommonUtilities.Options; using Microsoft.Extensions.Options; -using Polly; using Tes.Repository; namespace TesApi.Web @@ -20,7 +20,7 @@ namespace TesApi.Web public sealed class RepositoryRetryHandler : IRepository where T : RepositoryItem { private readonly IRepository _repository; - private readonly IAsyncPolicy _asyncRetryPolicy; + private readonly RetryHandler.AsyncRetryHandlerPolicy _asyncRetryPolicy; /// /// Constructor for . @@ -33,10 +33,10 @@ public RepositoryRetryHandler(IRepository repository, IOptions repository, IOptions public Task CreateItemAsync(T item, CancellationToken cancellationToken) - => _asyncRetryPolicy.ExecuteAsync(ct => _repository.CreateItemAsync(item, ct), cancellationToken); + => _asyncRetryPolicy.ExecuteWithRetryAsync(ct => _repository.CreateItemAsync(item, ct), cancellationToken); /// public Task DeleteItemAsync(string id, CancellationToken cancellationToken) - => _asyncRetryPolicy.ExecuteAsync(ct => _repository.DeleteItemAsync(id, ct), cancellationToken); + => _asyncRetryPolicy.ExecuteWithRetryAsync(ct => _repository.DeleteItemAsync(id, ct), cancellationToken); /// public Task> GetItemsAsync(Expression> predicate, CancellationToken cancellationToken) - => _asyncRetryPolicy.ExecuteAsync(ct => _repository.GetItemsAsync(predicate, ct), cancellationToken); + => _asyncRetryPolicy.ExecuteWithRetryAsync(ct => _repository.GetItemsAsync(predicate, ct), cancellationToken); /// public Task.GetItemsResult> GetItemsAsync(string continuationToken, int pageSize, CancellationToken cancellationToken, FormattableString rawPredicate, IEnumerable>> predicates) - => _asyncRetryPolicy.ExecuteAsync(ct => _repository.GetItemsAsync(continuationToken, pageSize, ct, rawPredicate, predicates), cancellationToken); + => _asyncRetryPolicy.ExecuteWithRetryAsync(ct => _repository.GetItemsAsync(continuationToken, pageSize, ct, rawPredicate, predicates), 
cancellationToken); /// public Task TryGetItemAsync(string id, CancellationToken cancellationToken, Action onSuccess) - => _asyncRetryPolicy.ExecuteAsync(ct => _repository.TryGetItemAsync(id, ct, onSuccess), cancellationToken); + => _asyncRetryPolicy.ExecuteWithRetryAsync(ct => _repository.TryGetItemAsync(id, ct, onSuccess), cancellationToken); /// public Task UpdateItemAsync(T item, CancellationToken cancellationToken) - => _asyncRetryPolicy.ExecuteAsync(ct => _repository.UpdateItemAsync(item, ct), cancellationToken); + => _asyncRetryPolicy.ExecuteWithRetryAsync(ct => _repository.UpdateItemAsync(item, ct), cancellationToken); /// public ValueTask TryRemoveItemFromCacheAsync(T item, CancellationToken cancellationToken) From 8554d30da087eb7d6e0592b8083432f637201e9e Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 18 Oct 2024 07:36:30 -0700 Subject: [PATCH 155/202] Add ability to utilize enumerable retry sleep providers --- src/CommonUtilities/CommonUtilities.csproj | 1 + src/CommonUtilities/RetryHandler.cs | 5 + src/CommonUtilities/RetryPolicyBuilder.cs | 198 +++++++++++++++++---- 3 files changed, 172 insertions(+), 32 deletions(-) diff --git a/src/CommonUtilities/CommonUtilities.csproj b/src/CommonUtilities/CommonUtilities.csproj index 94f9f07dd..e98425d6f 100644 --- a/src/CommonUtilities/CommonUtilities.csproj +++ b/src/CommonUtilities/CommonUtilities.csproj @@ -20,6 +20,7 @@ + diff --git a/src/CommonUtilities/RetryHandler.cs b/src/CommonUtilities/RetryHandler.cs index e080fd208..9f1dd821c 100644 --- a/src/CommonUtilities/RetryHandler.cs +++ b/src/CommonUtilities/RetryHandler.cs @@ -14,11 +14,16 @@ public static class RetryHandler /// Polly Context key for caller method name /// public const string CallerMemberNameKey = $"Tes.ApiClients.{nameof(RetryHandler)}.CallerMemberName"; + /// /// Polly Context key for backup skip increment setting /// public const string BackupSkipProvidedIncrementKey = 
$"Tes.ApiClients.{nameof(RetryHandler)}.BackupSkipProvidedIncrementCount"; + /// Polly Context key combined sleep method and enumerable duration policies + /// + public const string CombineSleepDurationsKey = $"Tes.ApiClients.{nameof(RetryHandler)}.CombineSleepDurations"; + #region RetryHandlerPolicies /// /// Non-generic synchronous retry policy diff --git a/src/CommonUtilities/RetryPolicyBuilder.cs b/src/CommonUtilities/RetryPolicyBuilder.cs index e528635ac..215075f50 100644 --- a/src/CommonUtilities/RetryPolicyBuilder.cs +++ b/src/CommonUtilities/RetryPolicyBuilder.cs @@ -115,7 +115,7 @@ public interface IPolicyBuilderBase /// Wait policy that can return to use the backup wait policy. /// Maximum number of retries. /// Value in seconds which is raised by the power of the backup retry attempt. - /// True to pass backup wait provider its own attempt values, False to provide overall attemp values. + /// True to pass backup wait provider its own attempt values, False to provide overall attempt values. /// OnRetry hander IPolicyBuilderWait WithExceptionBasedWaitWithExponentialBackoffBackup(Func waitDurationProvider, int maxRetryCount, double exponentialBackOffExponent, bool backupSkipProvidedIncrements); @@ -123,7 +123,7 @@ public interface IPolicyBuilderBase /// Custom optional exception-based wait policy backed up by the default wait policy. /// /// Wait policy that can return to use the backup wait policy. - /// True to pass backup wait provider its own attempt values, False to provide overall attemp values. + /// True to pass backup wait provider its own attempt values, False to provide overall attempt values. /// OnRetry hander IPolicyBuilderWait WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func waitDurationProvider, bool backupSkipProvidedIncrements); } @@ -170,7 +170,7 @@ public interface IPolicyBuilderBase /// Wait policy that can return to use the backup wait policy. /// Maximum number of retries. 
/// Value in seconds which is raised by the power of the backup retry attempt. - /// True to pass backup wait provider its own attempt values, False to provide overall attemp values. + /// True to pass backup wait provider its own attempt values, False to provide overall attempt values. /// OnRetry hander IPolicyBuilderWait WithExceptionBasedWaitWithExponentialBackoffBackup(Func waitDurationProvider, int retryCount, double exponentialBackOffExponent, bool backupSkipProvidedIncrements); @@ -178,7 +178,7 @@ public interface IPolicyBuilderBase /// Custom optional exception-based wait policy backed up by the default wait policy. /// /// Wait policy that can return to use the backup wait policy. - /// True to pass backup wait provider its own attempt values, False to provide overall attemp values. + /// True to pass backup wait provider its own attempt values, False to provide overall attempt values. /// OnRetry hander IPolicyBuilderWait WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func waitDurationProvider, bool backupSkipProvidedIncrements); } @@ -240,7 +240,7 @@ public interface IPolicyBuilderBuild IAsyncPolicy AsyncBuildPolicy(); /// - /// Retrives the instance of the retryhandler to accomodate extensions to the builder + /// Retrieves the instance of the retryhandler to accommodate extensions to the builder /// RetryPolicyBuilder PolicyBuilderBase { get; } } @@ -272,7 +272,7 @@ public interface IPolicyBuilderBuild IAsyncPolicy AsyncBuildPolicy(); /// - /// Retrives the instance of the retryhandler to accomodate extensions to the builder + /// Retrieves the instance of the retryhandler to accommodate extensions to the builder /// RetryPolicyBuilder PolicyBuilderBase { get; } } @@ -426,54 +426,171 @@ IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWi private readonly struct PolicyBuilderWait : IPolicyBuilderWait { public readonly PolicyBuilderBase builderBase; - public readonly Func sleepDurationProvider; + public readonly Func? 
sleepDurationProvider; + public readonly IEnumerable? sleepDurationsEnumerable; public readonly int maxRetryCount; - public PolicyBuilderWait(PolicyBuilderBase builderBase, int maxRetryCount, Func sleepDurationProvider) + private static Func CombineSleepDurations(Func provider, IEnumerable enumerable) { - ArgumentNullException.ThrowIfNull(sleepDurationProvider); + var combined = enumerable.Select(span => new Func(duration => + { + try + { + return duration + span; + } + catch (OverflowException) + { + return TimeSpan.MaxValue; + } + })).ToList(); + + return new((attempt, exception, context) => + { + List> stored; + + if (attempt == 1) + { + context[RetryHandler.CombineSleepDurationsKey] = stored = combined; + } + else if (context.TryGetValue(RetryHandler.CombineSleepDurationsKey, out var value) && value is List> foundValue) + { + stored = foundValue; + } + else + { + throw new System.Diagnostics.UnreachableException($"{RetryHandler.CombineSleepDurationsKey} should have been set in Polly Context at first retry"); + } + + var final = stored[attempt - 1](provider(attempt, exception, context)); + return final; + }); + } + + public PolicyBuilderWait(PolicyBuilderBase builderBase, int maxRetryCount, Func? sleepDurationProvider = default, IEnumerable? 
sleepDurationsEnumerable = default, bool combineSleepDurations = false) + { + if (sleepDurationProvider is null && sleepDurationsEnumerable is null) + { + throw new ArgumentNullException(null, $"At least one of {nameof(sleepDurationProvider)} or {nameof(sleepDurationsEnumerable)} must be provided."); + } + + if (combineSleepDurations && (sleepDurationProvider is null || sleepDurationsEnumerable is null)) + { + throw new ArgumentException("Both sleepDurationsEnumerable and a sleep durations provider must be provided.", nameof(combineSleepDurations)); + } + this.builderBase = builderBase; this.maxRetryCount = maxRetryCount; - this.sleepDurationProvider = sleepDurationProvider; + + if (combineSleepDurations) + { + this.sleepDurationProvider = CombineSleepDurations(sleepDurationProvider!, sleepDurationsEnumerable!); + this.sleepDurationsEnumerable = null; + } + else + { + this.sleepDurationProvider = sleepDurationProvider; + this.sleepDurationsEnumerable = sleepDurationsEnumerable; + } + + if (this.sleepDurationProvider is not null && this.sleepDurationsEnumerable is not null) + { + throw new ArgumentException($"{nameof(sleepDurationsEnumerable)} overrides {nameof(sleepDurationProvider)}", nameof(sleepDurationsEnumerable)); + } } /// IPolicyBuilderBuild IPolicyBuilderWait.SetOnRetryBehavior(ILogger? logger, RetryHandler.OnRetryHandler? onRetry, RetryHandler.OnRetryHandlerAsync? onRetryAsync) - => new PolicyBuilderBuild(this, sleepDurationProvider, logger, onRetry, onRetryAsync); + => new PolicyBuilderBuild(this, sleepDurationProvider, sleepDurationsEnumerable, logger, onRetry, onRetryAsync); } private readonly struct PolicyBuilderWait : IPolicyBuilderWait { public readonly PolicyBuilderBase builderBase; - public readonly Func? sleepDurationProvider; public readonly Func, Context, TimeSpan>? genericSleepDurationProvider; + public readonly IEnumerable? 
sleepDurationsEnumerable; public readonly int maxRetryCount; - private static Func, Context, TimeSpan> PickSleepDurationProvider(Func, Context, TimeSpan>? tResultProvider, Func? exceptionProvider) - => tResultProvider is null ? (attempt, outcome, ctx) => exceptionProvider!(attempt, outcome.Exception, ctx) : tResultProvider; + private static Func, Context, TimeSpan>? PickSleepDurationProvider(Func, Context, TimeSpan>? tResultProvider, Func? exceptionProvider) + => tResultProvider is null ? (exceptionProvider is null ? null : (attempt, outcome, ctx) => exceptionProvider(attempt, outcome.Exception, ctx)) : tResultProvider; + + private static Func, Context, TimeSpan> CombineSleepDurations(Func, Context, TimeSpan> provider, IEnumerable enumerable) + { + var combined = enumerable.Select(span => new Func(duration => + { + try + { + return duration + span; + } + catch (OverflowException) + { + return TimeSpan.MaxValue; + } + })).ToList(); + + return new((attempt, result, context) => + { + List> stored; + + if(attempt == 1) + { + context[RetryHandler.CombineSleepDurationsKey] = stored = combined; + } + else if (context.TryGetValue(RetryHandler.CombineSleepDurationsKey, out var value) && value is List> foundValue) + { + stored = foundValue; + } + else + { + throw new System.Diagnostics.UnreachableException($"{RetryHandler.CombineSleepDurationsKey} should have been set in Polly Context at first retry"); + } + + var final = stored[attempt - 1](provider(attempt, result, context)); + return final; + }); + } - public PolicyBuilderWait(PolicyBuilderBase builderBase, int maxRetryCount, Func, Context, TimeSpan>? sleepDurationProviderResult, Func? sleepDurationProviderException) + public PolicyBuilderWait(PolicyBuilderBase builderBase, int maxRetryCount, Func, Context, TimeSpan>? sleepDurationProviderResult = default, Func? sleepDurationProviderException = default, IEnumerable? 
sleepDurationsEnumerable = default, bool combineSleepDurations = false) { - if (sleepDurationProviderException is null && sleepDurationProviderResult is null) + if (sleepDurationProviderException is null && sleepDurationProviderResult is null && sleepDurationsEnumerable is null) { - throw new ArgumentNullException(null, $"At least one of {nameof(sleepDurationProviderResult)} or {nameof(sleepDurationProviderException)} must be provided."); + throw new ArgumentNullException(null, $"At least one of {nameof(sleepDurationProviderResult)}, {nameof(sleepDurationProviderException)} or {nameof(sleepDurationsEnumerable)} must be provided."); + } + + if (combineSleepDurations && ((sleepDurationProviderResult is null && sleepDurationProviderException is null) || sleepDurationsEnumerable is null)) + { + throw new ArgumentException("Both sleepDurationsEnumerable and a sleep durations provider must be provided.", nameof(combineSleepDurations)); } this.builderBase = builderBase; this.maxRetryCount = maxRetryCount; - this.sleepDurationProvider = sleepDurationProviderException; - this.genericSleepDurationProvider = sleepDurationProviderResult; + + if (combineSleepDurations) + { + this.genericSleepDurationProvider = CombineSleepDurations(PickSleepDurationProvider(genericSleepDurationProvider, sleepDurationProviderException)!, sleepDurationsEnumerable!); + this.sleepDurationsEnumerable = null; + } + else + { + this.genericSleepDurationProvider = PickSleepDurationProvider(genericSleepDurationProvider, sleepDurationProviderException); + this.sleepDurationsEnumerable = sleepDurationsEnumerable; + } + + if (this.genericSleepDurationProvider is not null && this.sleepDurationsEnumerable is not null) + { + throw new ArgumentException($"{nameof(sleepDurationsEnumerable)} overrides {nameof(sleepDurationProviderResult)} and {nameof(sleepDurationProviderException)}", nameof(sleepDurationsEnumerable)); + } } /// IPolicyBuilderBuild IPolicyBuilderWait.SetOnRetryBehavior(ILogger? 
logger, RetryHandler.OnRetryHandler? onRetry, RetryHandler.OnRetryHandlerAsync? onRetryAsync) - => new PolicyBuilderBuild(this, PickSleepDurationProvider(genericSleepDurationProvider, sleepDurationProvider), logger, onRetry, onRetryAsync); + => new PolicyBuilderBuild(this, genericSleepDurationProvider, sleepDurationsEnumerable, logger, onRetry, onRetryAsync); } private readonly struct PolicyBuilderBuild : IPolicyBuilderBuild { private readonly PolicyBuilderWait builderWait; - private readonly Func sleepDurationProvider; + private readonly Func? sleepDurationProvider; + public readonly IEnumerable? sleepDurationsEnumerable; private readonly ILogger? logger; private readonly RetryHandler.OnRetryHandler? onRetryHandler; private readonly RetryHandler.OnRetryHandlerAsync? onRetryHandlerAsync; @@ -481,11 +598,16 @@ IPolicyBuilderBuild IPolicyBuilderWait.SetOnRetryBehavior(ILog /// public RetryPolicyBuilder PolicyBuilderBase { get; } - public PolicyBuilderBuild(PolicyBuilderWait builderWait, Func sleepDurationProvider, ILogger? logger, RetryHandler.OnRetryHandler? onRetry, RetryHandler.OnRetryHandlerAsync? onRetryAsync) + public PolicyBuilderBuild(PolicyBuilderWait builderWait, Func? sleepDurationProvider, IEnumerable? sleepDurationsEnumerable, ILogger? logger, RetryHandler.OnRetryHandler? onRetry, RetryHandler.OnRetryHandlerAsync? 
onRetryAsync) { - ArgumentNullException.ThrowIfNull(sleepDurationProvider); + if (sleepDurationsEnumerable is null) + { + ArgumentNullException.ThrowIfNull(sleepDurationProvider); + } + this.builderWait = builderWait; this.sleepDurationProvider = sleepDurationProvider; + this.sleepDurationsEnumerable = sleepDurationsEnumerable; this.logger = logger; this.onRetryHandler = onRetry; this.onRetryHandlerAsync = onRetryAsync; @@ -528,19 +650,23 @@ public static Func OnRetryHandlerAsync( /// ISyncPolicy IPolicyBuilderBuild.SyncBuildPolicy() { - var waitProvider = sleepDurationProvider; + var waitProvider = sleepDurationProvider!; var onRetryProvider = OnRetryHandler(logger, onRetryHandler); - return builderWait.builderBase.policyBuilder.WaitAndRetry(builderWait.maxRetryCount, (attempt, ctx) => waitProvider(attempt, default, ctx), onRetryProvider); + return sleepDurationsEnumerable is null + ? builderWait.builderBase.policyBuilder.WaitAndRetry(builderWait.maxRetryCount, (attempt, ctx) => waitProvider(attempt, default, ctx), onRetryProvider) + : builderWait.builderBase.policyBuilder.WaitAndRetry(sleepDurationsEnumerable, onRetryProvider); } /// IAsyncPolicy IPolicyBuilderBuild.AsyncBuildPolicy() { - var waitProvider = sleepDurationProvider; + var waitProvider = sleepDurationProvider!; var onRetryProvider = OnRetryHandlerAsync(logger, onRetryHandler, onRetryHandlerAsync); - return builderWait.builderBase.policyBuilder.WaitAndRetryAsync(builderWait.maxRetryCount, waitProvider, onRetryProvider); + return sleepDurationsEnumerable is null + ? 
builderWait.builderBase.policyBuilder.WaitAndRetryAsync(builderWait.maxRetryCount, waitProvider, onRetryProvider) + : builderWait.builderBase.policyBuilder.WaitAndRetryAsync(sleepDurationsEnumerable, onRetryProvider); } /// @@ -555,7 +681,8 @@ RetryHandler.AsyncRetryHandlerPolicy IPolicyBuilderBuild.AsyncBuild() private readonly struct PolicyBuilderBuild : IPolicyBuilderBuild { private readonly PolicyBuilderWait builderWait; - private readonly Func, Context, TimeSpan> sleepDurationProvider; + private readonly Func, Context, TimeSpan>? sleepDurationProvider; + public readonly IEnumerable? sleepDurationsEnumerable; private readonly ILogger? logger; private readonly RetryHandler.OnRetryHandler? onRetryHandler; private readonly RetryHandler.OnRetryHandlerAsync? onRetryHandlerAsync; @@ -563,11 +690,16 @@ RetryHandler.AsyncRetryHandlerPolicy IPolicyBuilderBuild.AsyncBuild() /// public RetryPolicyBuilder PolicyBuilderBase { get; } - public PolicyBuilderBuild(PolicyBuilderWait builderWait, Func, Context, TimeSpan> sleepDurationProvider, ILogger? logger, RetryHandler.OnRetryHandler? onRetry, RetryHandler.OnRetryHandlerAsync? onRetryAsync) + public PolicyBuilderBuild(PolicyBuilderWait builderWait, Func, Context, TimeSpan>? sleepDurationProvider, IEnumerable? sleepDurationsEnumerable, ILogger? logger, RetryHandler.OnRetryHandler? onRetry, RetryHandler.OnRetryHandlerAsync? 
onRetryAsync) { - ArgumentNullException.ThrowIfNull(sleepDurationProvider); + if (sleepDurationsEnumerable is null) + { + ArgumentNullException.ThrowIfNull(sleepDurationProvider); + } + this.builderWait = builderWait; this.sleepDurationProvider = sleepDurationProvider; + this.sleepDurationsEnumerable = sleepDurationsEnumerable; this.logger = logger; this.onRetryHandler = onRetry; this.onRetryHandlerAsync = onRetryAsync; @@ -628,10 +760,12 @@ private static Func, TimeSpan, int, Context, Task> OnRet /// IAsyncPolicy IPolicyBuilderBuild.AsyncBuildPolicy() { - var waitProvider = sleepDurationProvider; + var waitProvider = sleepDurationProvider!; var onRetryProvider = OnRetryHandlerAsync(logger, onRetryHandler, onRetryHandlerAsync); - return builderWait.builderBase.policyBuilder.WaitAndRetryAsync(builderWait.maxRetryCount, waitProvider, onRetryProvider); + return sleepDurationsEnumerable is null + ? builderWait.builderBase.policyBuilder.WaitAndRetryAsync(builderWait.maxRetryCount, waitProvider, onRetryProvider) + : builderWait.builderBase.policyBuilder.WaitAndRetryAsync(sleepDurationsEnumerable, onRetryProvider); } ///// From 049c4cb3f5bc827af26211079507a9b0018b8dd9 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 18 Oct 2024 08:40:54 -0700 Subject: [PATCH 156/202] Add jitter by default --- src/CommonUtilities/RetryPolicyBuilder.cs | 97 +++++++++++++++++------ src/TesApi.Web/AzureProxy.cs | 10 +-- src/TesApi.Web/CachingWithRetriesBase.cs | 8 +- 3 files changed, 77 insertions(+), 38 deletions(-) diff --git a/src/CommonUtilities/RetryPolicyBuilder.cs b/src/CommonUtilities/RetryPolicyBuilder.cs index 215075f50..aeeaab447 100644 --- a/src/CommonUtilities/RetryPolicyBuilder.cs +++ b/src/CommonUtilities/RetryPolicyBuilder.cs @@ -93,6 +93,12 @@ public interface IPolicyBuilderBase /// OnRetry hander IPolicyBuilderWait WithRetryPolicyOptionsWait(); + /// + /// Default exponential wait policy. 
+ /// + /// OnRetry hander + IPolicyBuilderWait WithExponentialBackoffWait(); + /// /// Custom exponential wait policy. /// @@ -110,22 +116,29 @@ public interface IPolicyBuilderBase IPolicyBuilderWait WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func waitDurationProvider); /// - /// Custom optional exception-based wait policy backed up by an exponential wait policy. + /// Custom optional exception-based wait policy backed up by the default wait policy. + /// + /// Wait policy that can return to use the backup wait policy. + /// OnRetry hander + IPolicyBuilderWait WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func waitDurationProvider); + + /// + /// Custom optional exception-based wait policy backed up by the default exponential wait policy. /// /// Wait policy that can return to use the backup wait policy. - /// Maximum number of retries. - /// Value in seconds which is raised by the power of the backup retry attempt. /// True to pass backup wait provider its own attempt values, False to provide overall attempt values. /// OnRetry hander - IPolicyBuilderWait WithExceptionBasedWaitWithExponentialBackoffBackup(Func waitDurationProvider, int maxRetryCount, double exponentialBackOffExponent, bool backupSkipProvidedIncrements); + IPolicyBuilderWait WithExceptionBasedWaitWithExponentialBackoffBackup(Func waitDurationProvider, bool backupSkipProvidedIncrements); /// - /// Custom optional exception-based wait policy backed up by the default wait policy. + /// Custom optional exception-based wait policy backed up by an exponential wait policy. /// /// Wait policy that can return to use the backup wait policy. + /// Maximum number of retries. + /// Value in seconds which is raised by the power of the backup retry attempt. /// True to pass backup wait provider its own attempt values, False to provide overall attempt values. 
/// OnRetry hander - IPolicyBuilderWait WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func waitDurationProvider, bool backupSkipProvidedIncrements); + IPolicyBuilderWait WithExceptionBasedWaitWithExponentialBackoffBackup(Func waitDurationProvider, int maxRetryCount, double exponentialBackOffExponent, bool backupSkipProvidedIncrements); } /// @@ -139,6 +152,12 @@ public interface IPolicyBuilderBase /// OnRetry hander IPolicyBuilderWait WithRetryPolicyOptionsWait(); + /// + /// Default exponential wait policy. + /// + /// OnRetry hander + IPolicyBuilderWait WithExponentialBackoffWait(); + /// /// Custom exponential wait policy. /// @@ -165,22 +184,29 @@ public interface IPolicyBuilderBase IPolicyBuilderWait WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func waitDurationProvider); /// - /// Custom optional exception-based wait policy backed up by an exponential wait policy. + /// Custom optional exception-based wait policy backed up by the default wait policy. + /// + /// Wait policy that can return to use the backup wait policy. + /// OnRetry hander + IPolicyBuilderWait WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func waitDurationProvider); + + /// + /// Custom optional exception-based wait policy backed up by the default exponential wait policy. /// /// Wait policy that can return to use the backup wait policy. - /// Maximum number of retries. - /// Value in seconds which is raised by the power of the backup retry attempt. /// True to pass backup wait provider its own attempt values, False to provide overall attempt values. 
/// OnRetry hander - IPolicyBuilderWait WithExceptionBasedWaitWithExponentialBackoffBackup(Func waitDurationProvider, int retryCount, double exponentialBackOffExponent, bool backupSkipProvidedIncrements); + IPolicyBuilderWait WithExceptionBasedWaitWithExponentialBackoffBackup(Func waitDurationProvider, bool backupSkipProvidedIncrements); /// - /// Custom optional exception-based wait policy backed up by the default wait policy. + /// Custom optional exception-based wait policy backed up by an exponential wait policy. /// /// Wait policy that can return to use the backup wait policy. + /// Maximum number of retries. + /// Value in seconds which is raised by the power of the backup retry attempt. /// True to pass backup wait provider its own attempt values, False to provide overall attempt values. /// OnRetry hander - IPolicyBuilderWait WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func waitDurationProvider, bool backupSkipProvidedIncrements); + IPolicyBuilderWait WithExceptionBasedWaitWithExponentialBackoffBackup(Func waitDurationProvider, int retryCount, double exponentialBackOffExponent, bool backupSkipProvidedIncrements); } /// @@ -329,7 +355,10 @@ public PolicyBuilderBase(PolicyBuilder policyBuilder, Defaults defaults) Defaults = defaults; } - public static Func DefaultSleepDurationProvider(Defaults defaults) + public static IEnumerable DefaultSleepDurationProvider(Defaults defaults) + => Polly.Contrib.WaitAndRetry.Backoff.DecorrelatedJitterBackoffV2(TimeSpan.FromSeconds(defaults.PolicyOptions.ExponentialBackOffExponent), defaults.PolicyOptions.MaxRetryCount); + + public static Func DefaultExponentialSleepDurationProvider(Defaults defaults) => ExponentialSleepDurationProvider(defaults.PolicyOptions.ExponentialBackOffExponent); public static Func ExponentialSleepDurationProvider(double exponentialBackOffExponent) @@ -366,23 +395,31 @@ TimeSpan AdjustAttemptIfNeeded() /// IPolicyBuilderWait IPolicyBuilderBase.WithRetryPolicyOptionsWait() - => new 
PolicyBuilderWait(this, Defaults.PolicyOptions.MaxRetryCount, DefaultSleepDurationProvider(Defaults)); + => new PolicyBuilderWait(this, Defaults.PolicyOptions.MaxRetryCount, sleepDurationsEnumerable: DefaultSleepDurationProvider(Defaults)); + + /// + IPolicyBuilderWait IPolicyBuilderBase.WithExponentialBackoffWait() + => new PolicyBuilderWait(this, Defaults.PolicyOptions.MaxRetryCount, sleepDurationProvider: DefaultExponentialSleepDurationProvider(Defaults)); /// IPolicyBuilderWait IPolicyBuilderBase.WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func sleepDurationProvider) - => new PolicyBuilderWait(this, maxRetryCount, (attempt, outcome, _1) => sleepDurationProvider(attempt, outcome)); + => new PolicyBuilderWait(this, maxRetryCount, sleepDurationProvider: (attempt, outcome, _1) => sleepDurationProvider(attempt, outcome)); /// IPolicyBuilderWait IPolicyBuilderBase.WithExponentialBackoffWait(int retryCount, double exponentialBackOffExponent) - => new PolicyBuilderWait(this, retryCount, ExponentialSleepDurationProvider(exponentialBackOffExponent)); + => new PolicyBuilderWait(this, retryCount, sleepDurationProvider: ExponentialSleepDurationProvider(exponentialBackOffExponent)); /// - IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func sleepDurationProvider, bool backupSkipProvidedIncrements) - => new PolicyBuilderWait(this, Defaults.PolicyOptions.MaxRetryCount, ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, Defaults.PolicyOptions.ExponentialBackOffExponent, backupSkipProvidedIncrements)); + IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func sleepDurationProvider) + => new PolicyBuilderWait(this, Defaults.PolicyOptions.MaxRetryCount, sleepDurationProvider: (attempt, exception, _) => sleepDurationProvider(attempt, exception) ?? 
TimeSpan.Zero, sleepDurationsEnumerable: DefaultSleepDurationProvider(Defaults), combineSleepDurations: true); + + /// + IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithExponentialBackoffBackup(Func sleepDurationProvider, bool backupSkipProvidedIncrements) + => new PolicyBuilderWait(this, Defaults.PolicyOptions.MaxRetryCount, sleepDurationProvider: ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, Defaults.PolicyOptions.ExponentialBackOffExponent, backupSkipProvidedIncrements)); /// IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithExponentialBackoffBackup(Func sleepDurationProvider, int retryCount, double exponentialBackOffExponent, bool backupSkipProvidedIncrements) - => new PolicyBuilderWait(this, retryCount, ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, exponentialBackOffExponent, backupSkipProvidedIncrements)); + => new PolicyBuilderWait(this, retryCount, sleepDurationProvider: ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, exponentialBackOffExponent, backupSkipProvidedIncrements)); } private readonly struct PolicyBuilderBase : IPolicyBuilderBase @@ -400,27 +437,35 @@ public PolicyBuilderBase(PolicyBuilder policyBuilder, Defaults defaults /// IPolicyBuilderWait IPolicyBuilderBase.WithRetryPolicyOptionsWait() - => new PolicyBuilderWait(this, Defaults.PolicyOptions.MaxRetryCount, default, PolicyBuilderBase.DefaultSleepDurationProvider(Defaults)); + => new PolicyBuilderWait(this, Defaults.PolicyOptions.MaxRetryCount, sleepDurationsEnumerable: PolicyBuilderBase.DefaultSleepDurationProvider(Defaults)); + + /// + IPolicyBuilderWait IPolicyBuilderBase.WithExponentialBackoffWait() + => new PolicyBuilderWait(this, Defaults.PolicyOptions.MaxRetryCount, sleepDurationProviderException: PolicyBuilderBase.DefaultExponentialSleepDurationProvider(Defaults)); /// IPolicyBuilderWait 
IPolicyBuilderBase.WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func waitDurationProvider) - => new PolicyBuilderWait(this, maxRetryCount, default, (attempt, outcome, _1) => waitDurationProvider(attempt, outcome)); + => new PolicyBuilderWait(this, maxRetryCount, sleepDurationProviderException: (attempt, outcome, _1) => waitDurationProvider(attempt, outcome)); /// IPolicyBuilderWait IPolicyBuilderBase.WithCustomizedRetryPolicyOptionsWait(int maxRetryCount, Func, TimeSpan> sleepDurationProvider) - => new PolicyBuilderWait(this, maxRetryCount, (attempt, outcome, _1) => sleepDurationProvider(attempt, outcome), default); + => new PolicyBuilderWait(this, maxRetryCount, sleepDurationProviderResult: (attempt, outcome, _1) => sleepDurationProvider(attempt, outcome)); /// IPolicyBuilderWait IPolicyBuilderBase.WithExponentialBackoffWait(int maxRetryCount, double exponentialBackOffExponent) - => new PolicyBuilderWait(this, maxRetryCount, default, PolicyBuilderBase.ExponentialSleepDurationProvider(exponentialBackOffExponent)); + => new PolicyBuilderWait(this, maxRetryCount, sleepDurationProviderException: PolicyBuilderBase.ExponentialSleepDurationProvider(exponentialBackOffExponent)); + + /// + IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func sleepDurationProvider) + => new PolicyBuilderWait(this, Defaults.PolicyOptions.MaxRetryCount, sleepDurationProviderException: (attempt, exception, _) => sleepDurationProvider(attempt, exception) ?? 
TimeSpan.Zero, sleepDurationsEnumerable: PolicyBuilderBase.DefaultSleepDurationProvider(Defaults), combineSleepDurations: true); /// - IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithRetryPolicyOptionsBackup(Func sleepDurationProvider, bool backupSkipProvidedIncrements) - => new PolicyBuilderWait(this, Defaults.PolicyOptions.MaxRetryCount, default, PolicyBuilderBase.ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, Defaults.PolicyOptions.ExponentialBackOffExponent, backupSkipProvidedIncrements)); + IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithExponentialBackoffBackup(Func sleepDurationProvider, bool backupSkipProvidedIncrements) + => new PolicyBuilderWait(this, Defaults.PolicyOptions.MaxRetryCount, sleepDurationProviderException: PolicyBuilderBase.ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, Defaults.PolicyOptions.ExponentialBackOffExponent, backupSkipProvidedIncrements)); /// IPolicyBuilderWait IPolicyBuilderBase.WithExceptionBasedWaitWithExponentialBackoffBackup(Func sleepDurationProvider, int retryCount, double exponentialBackOffExponent, bool backupSkipProvidedIncrements) - => new PolicyBuilderWait(this, retryCount, default, PolicyBuilderBase.ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, exponentialBackOffExponent, backupSkipProvidedIncrements)); + => new PolicyBuilderWait(this, retryCount, sleepDurationProviderException: PolicyBuilderBase.ExceptionBasedSleepDurationProviderWithExponentialBackoffBackup(sleepDurationProvider, exponentialBackOffExponent, backupSkipProvidedIncrements)); } private readonly struct PolicyBuilderWait : IPolicyBuilderWait diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index 9307f4769..5b4b02d66 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -7,7 +7,6 @@ using System.Threading; using System.Threading.Tasks; using Azure.Core; 
-using Azure.Identity; using Azure.ResourceManager; using Azure.ResourceManager.Resources; using Azure.ResourceManager.Storage; @@ -28,13 +27,8 @@ using TesApi.Web.Storage; using static CommonUtilities.RetryHandler; using BatchProtocol = Microsoft.Azure.Batch.Protocol; -using BlobModels = Azure.Storage.Blobs.Models; using CloudTask = Microsoft.Azure.Batch.CloudTask; -using ComputeNodeState = Microsoft.Azure.Batch.Common.ComputeNodeState; -using JobState = Microsoft.Azure.Batch.Common.JobState; using OnAllTasksComplete = Microsoft.Azure.Batch.Common.OnAllTasksComplete; -using TaskExecutionInformation = Microsoft.Azure.Batch.TaskExecutionInformation; -using TaskState = Microsoft.Azure.Batch.Common.TaskState; namespace TesApi.Web { @@ -91,13 +85,13 @@ public AzureProxy(IOptions batchAccountOptions, BatchAccoun batchRetryPolicyWhenJobNotFound = retryHandler.PolicyBuilder .OpinionatedRetryPolicy(Policy.Handle(ex => BatchErrorCodeStrings.JobNotFound.Equals(ex.RequestInformation.BatchError.Code, StringComparison.OrdinalIgnoreCase))) - .WithExceptionBasedWaitWithRetryPolicyOptionsBackup((attempt, exception) => (exception as BatchException)?.RequestInformation?.RetryAfter, backupSkipProvidedIncrements: true) + .WithExceptionBasedWaitWithRetryPolicyOptionsBackup((attempt, exception) => (exception as BatchException)?.RequestInformation?.RetryAfter) .SetOnRetryBehavior(onRetry: LogRetryErrorOnRetryHandler()) .AsyncBuild(); batchRetryPolicyWhenNodeNotReady = retryHandler.PolicyBuilder .OpinionatedRetryPolicy(Policy.Handle(ex => "NodeNotReady".Equals(ex.RequestInformation.BatchError.Code, StringComparison.OrdinalIgnoreCase))) - .WithExceptionBasedWaitWithRetryPolicyOptionsBackup((attempt, exception) => (exception as BatchException)?.RequestInformation?.RetryAfter, backupSkipProvidedIncrements: true) + .WithExceptionBasedWaitWithRetryPolicyOptionsBackup((attempt, exception) => (exception as BatchException)?.RequestInformation?.RetryAfter) .SetOnRetryBehavior(onRetry: 
LogRetryErrorOnRetryHandler()) .AsyncBuild(); diff --git a/src/TesApi.Web/CachingWithRetriesBase.cs b/src/TesApi.Web/CachingWithRetriesBase.cs index 02a6550b7..b56dc897e 100644 --- a/src/TesApi.Web/CachingWithRetriesBase.cs +++ b/src/TesApi.Web/CachingWithRetriesBase.cs @@ -51,19 +51,19 @@ protected CachingWithRetriesBase(CachingRetryPolicyBuilder cachingRetryHandler, this.cachingRetry = cachingRetryHandler.PolicyBuilder .OpinionatedRetryPolicy() - .WithExceptionBasedWaitWithRetryPolicyOptionsBackup(sleepDuration, backupSkipProvidedIncrements: true).SetOnRetryBehavior(this.logger).AddCaching().SyncBuild(); + .WithExceptionBasedWaitWithRetryPolicyOptionsBackup(sleepDuration).SetOnRetryBehavior(this.logger).AddCaching().SyncBuild(); this.cachingAsyncRetry = cachingRetryHandler.PolicyBuilder .OpinionatedRetryPolicy() - .WithExceptionBasedWaitWithRetryPolicyOptionsBackup(sleepDuration, backupSkipProvidedIncrements: true).SetOnRetryBehavior(this.logger).AddCaching().AsyncBuild(); + .WithExceptionBasedWaitWithRetryPolicyOptionsBackup(sleepDuration).SetOnRetryBehavior(this.logger).AddCaching().AsyncBuild(); this.cachingAsyncRetryExceptWhenExists = cachingRetryHandler.PolicyBuilder .OpinionatedRetryPolicy(Polly.Policy.Handle(ex => !CreationErrorFoundCodes.Contains(ex.RequestInformation?.BatchError?.Code, StringComparer.OrdinalIgnoreCase))) - .WithExceptionBasedWaitWithRetryPolicyOptionsBackup(sleepDuration, backupSkipProvidedIncrements: true).SetOnRetryBehavior(this.logger).AddCaching().AsyncBuild(); + .WithExceptionBasedWaitWithRetryPolicyOptionsBackup(sleepDuration).SetOnRetryBehavior(this.logger).AddCaching().AsyncBuild(); this.cachingAsyncRetryExceptWhenNotFound = cachingRetryHandler.PolicyBuilder .OpinionatedRetryPolicy(Polly.Policy.Handle(ex => !DeletionErrorFoundCodes.Contains(ex.RequestInformation?.BatchError?.Code, StringComparer.OrdinalIgnoreCase))) - .WithExceptionBasedWaitWithRetryPolicyOptionsBackup(sleepDuration, backupSkipProvidedIncrements: 
true).SetOnRetryBehavior(this.logger).AddCaching().AsyncBuild(); + .WithExceptionBasedWaitWithRetryPolicyOptionsBackup(sleepDuration).SetOnRetryBehavior(this.logger).AddCaching().AsyncBuild(); } private static readonly string[] CreationErrorFoundCodes = From ca978bd4cabb75b3e22e4e210227195671fead4b Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 18 Oct 2024 15:38:44 -0700 Subject: [PATCH 157/202] Consolidate restart handlers and exit services when startup fails --- Microsoft.GA4GH.TES.sln | 6 ++ .../PagedInterfaceExtensions.cs | 4 +- src/CommonUtilities/RetryHandler.cs | 17 +++++- src/CommonUtilities/UtilityExtensions.cs | 14 ++--- .../AzureBatchSkuVerifier.cs | 36 ++++++------ .../DatabaseOverloadedException.cs | 0 .../Repository => Tes.Repository}/ICache.cs | 0 .../IRepository.cs | 0 .../20230106185229_InitialCreate.Designer.cs | 0 .../20230106185229_InitialCreate.cs | 0 ...0230320202549_AddIndicesToJson.Designer.cs | 0 .../20230320202549_AddIndicesToJson.cs | 0 .../20230808235207_AddGinIndex.Designer.cs | 0 .../Migrations/20230808235207_AddGinIndex.cs | 0 .../Migrations/TesDbContextModelSnapshot.cs | 0 .../Models/PostgreSqlOptions.cs | 0 .../Models/TesTaskPostgres.cs | 0 .../PostgreSqlCachingRepository.cs | 15 +++-- .../RepositoryCollisionException.cs | 0 src/Tes.Repository/Tes.Repository.csproj | 31 ++++++++++ .../TesDbContext.cs | 0 .../TesRepositoryCache.cs | 0 .../TesTaskPostgreSqlRepository.cs | 2 +- .../ExpressionParameterSubstitute.cs | 0 .../PostgresConnectionStringUtility.cs | 0 .../Utilities/PrependableFormattableString.cs | 0 .../Authentication/CredentialsManager.cs | 24 +++----- src/Tes.Runner/Transfer/BlobApiHttpUtils.cs | 10 ++-- .../Transfer/HttpRetryPolicyDefinition.cs | 23 +++----- src/Tes.SDK/Tes.SDK.csproj | 6 +- src/Tes/Extensions/TesTaskExtensions.cs | 26 --------- src/Tes/Repository/RepositoryItem.cs | 2 +- src/Tes/Tes.csproj | 15 +---- src/TesApi.Web/AzureProxy.cs | 5 +- .../Management/ArmBatchQuotaProvider.cs | 1 - 
.../OrchestrateOnBatchSchedulerServiceBase.cs | 3 +- src/TesApi.Web/TaskScheduler.cs | 20 ++++++- src/TesApi.Web/TesApi.Web.csproj | 3 +- src/deploy-tes-on-azure/Deployer.cs | 57 ++++++++++--------- src/deploy-tes-on-azure/KubernetesManager.cs | 27 +++++---- 40 files changed, 187 insertions(+), 160 deletions(-) rename src/{Tes/Repository => Tes.Repository}/DatabaseOverloadedException.cs (100%) rename src/{Tes/Repository => Tes.Repository}/ICache.cs (100%) rename src/{Tes/Repository => Tes.Repository}/IRepository.cs (100%) rename src/{Tes => Tes.Repository}/Migrations/20230106185229_InitialCreate.Designer.cs (100%) rename src/{Tes => Tes.Repository}/Migrations/20230106185229_InitialCreate.cs (100%) rename src/{Tes => Tes.Repository}/Migrations/20230320202549_AddIndicesToJson.Designer.cs (100%) rename src/{Tes => Tes.Repository}/Migrations/20230320202549_AddIndicesToJson.cs (100%) rename src/{Tes => Tes.Repository}/Migrations/20230808235207_AddGinIndex.Designer.cs (100%) rename src/{Tes => Tes.Repository}/Migrations/20230808235207_AddGinIndex.cs (100%) rename src/{Tes => Tes.Repository}/Migrations/TesDbContextModelSnapshot.cs (100%) rename src/{Tes => Tes.Repository}/Models/PostgreSqlOptions.cs (100%) rename src/{Tes => Tes.Repository}/Models/TesTaskPostgres.cs (100%) rename src/{Tes/Repository => Tes.Repository}/PostgreSqlCachingRepository.cs (94%) rename src/{Tes/Repository => Tes.Repository}/RepositoryCollisionException.cs (100%) create mode 100644 src/Tes.Repository/Tes.Repository.csproj rename src/{Tes/Repository => Tes.Repository}/TesDbContext.cs (100%) rename src/{Tes/Repository => Tes.Repository}/TesRepositoryCache.cs (100%) rename src/{Tes/Repository => Tes.Repository}/TesTaskPostgreSqlRepository.cs (99%) rename src/{Tes => Tes.Repository}/Utilities/ExpressionParameterSubstitute.cs (100%) rename src/{Tes => Tes.Repository}/Utilities/PostgresConnectionStringUtility.cs (100%) rename src/{Tes => Tes.Repository}/Utilities/PrependableFormattableString.cs (100%) 
diff --git a/Microsoft.GA4GH.TES.sln b/Microsoft.GA4GH.TES.sln index fe33fbb7f..f0ed3aef1 100644 --- a/Microsoft.GA4GH.TES.sln +++ b/Microsoft.GA4GH.TES.sln @@ -49,6 +49,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Tes.SDK", "src\Tes.SDK\Tes. EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Tes.SDK.Tests", "src\Tes.SDK.Tests\Tes.SDK.Tests.csproj", "{AE7ADB92-BEC6-4030-B62F-BDBB6AC53CB4}" EndProject +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Tes.Repository", "src\Tes.Repository\Tes.Repository.csproj", "{515A4905-0522-4C72-BC18-41BE6A3BE880}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -115,6 +117,10 @@ Global {AE7ADB92-BEC6-4030-B62F-BDBB6AC53CB4}.Debug|Any CPU.Build.0 = Debug|Any CPU {AE7ADB92-BEC6-4030-B62F-BDBB6AC53CB4}.Release|Any CPU.ActiveCfg = Release|Any CPU {AE7ADB92-BEC6-4030-B62F-BDBB6AC53CB4}.Release|Any CPU.Build.0 = Release|Any CPU + {515A4905-0522-4C72-BC18-41BE6A3BE880}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {515A4905-0522-4C72-BC18-41BE6A3BE880}.Debug|Any CPU.Build.0 = Debug|Any CPU + {515A4905-0522-4C72-BC18-41BE6A3BE880}.Release|Any CPU.ActiveCfg = Release|Any CPU + {515A4905-0522-4C72-BC18-41BE6A3BE880}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/src/CommonUtilities/PagedInterfaceExtensions.cs b/src/CommonUtilities/PagedInterfaceExtensions.cs index 3230bb56a..2042211bc 100644 --- a/src/CommonUtilities/PagedInterfaceExtensions.cs +++ b/src/CommonUtilities/PagedInterfaceExtensions.cs @@ -69,10 +69,10 @@ public static IAsyncEnumerable ExecuteWithRetryAsync(this AsyncRetryHandle private sealed class PollyAsyncEnumerable : IAsyncEnumerable { private readonly IAsyncEnumerable _source; - private readonly RetryHandler.AsyncRetryHandlerPolicy _retryPolicy; + private readonly AsyncRetryHandlerPolicy _retryPolicy; private readonly Polly.Context _ctx; 
- public PollyAsyncEnumerable(IAsyncEnumerable source, RetryHandler.AsyncRetryHandlerPolicy retryPolicy, Polly.Context ctx) + public PollyAsyncEnumerable(IAsyncEnumerable source, AsyncRetryHandlerPolicy retryPolicy, Polly.Context ctx) { ArgumentNullException.ThrowIfNull(source); ArgumentNullException.ThrowIfNull(retryPolicy); diff --git a/src/CommonUtilities/RetryHandler.cs b/src/CommonUtilities/RetryHandler.cs index 9f1dd821c..0d02ba7fa 100644 --- a/src/CommonUtilities/RetryHandler.cs +++ b/src/CommonUtilities/RetryHandler.cs @@ -105,7 +105,6 @@ public AsyncRetryHandlerPolicy(IAsyncPolicy retryPolicy) /// For mocking public AsyncRetryHandlerPolicy() { } - /// /// Executes a delegate with the configured async policy. /// @@ -161,6 +160,20 @@ public virtual Task ExecuteWithRetryAsync(Func action(ct), PrepareContext(caller), cancellationToken); } + + /// + /// Executes the specified asynchronous action within the policy and returns the captured result. + /// + /// The action to perform. + /// A cancellation token which can be used to cancel the action. When a retry policy in use, also cancels any further retries. + /// Name of method originating the retriable operation. + /// The captured result. + public virtual Task ExecuteAndCaptureAsync(Func action, CancellationToken cancellationToken, [System.Runtime.CompilerServices.CallerMemberName] string? caller = default) + { + ArgumentNullException.ThrowIfNull(action); + + return retryPolicy.ExecuteAndCaptureAsync((_, token) => action(token), PrepareContext(caller), cancellationToken); + } } /// @@ -282,6 +295,6 @@ public virtual async Task ExecuteWithRetryAndConversionAsync(Func new() { - [CallerMemberNameKey] = caller + [CallerMemberNameKey] = caller ?? 
throw new ArgumentNullException(nameof(caller)) }; } diff --git a/src/CommonUtilities/UtilityExtensions.cs b/src/CommonUtilities/UtilityExtensions.cs index 4c3378c2f..91f089c34 100644 --- a/src/CommonUtilities/UtilityExtensions.cs +++ b/src/CommonUtilities/UtilityExtensions.cs @@ -105,13 +105,13 @@ public static void ForEach(this IEnumerable values, Action action) #endregion #region AddRange - //public static void AddRange(this IList list, IEnumerable values) - //{ - // foreach (var value in values) - // { - // list.Add(value); - // }; - //} + public static void AddRange(this IList list, IEnumerable values) + { + foreach (var value in values) + { + list.Add(value); + }; + } public static void AddRange(this IDictionary dictionary, IDictionary values) { diff --git a/src/GenerateBatchVmSkus/AzureBatchSkuVerifier.cs b/src/GenerateBatchVmSkus/AzureBatchSkuVerifier.cs index a11082345..0158fc0b4 100644 --- a/src/GenerateBatchVmSkus/AzureBatchSkuVerifier.cs +++ b/src/GenerateBatchVmSkus/AzureBatchSkuVerifier.cs @@ -7,11 +7,11 @@ using System.Text.RegularExpressions; using System.Threading.Channels; using Azure.Core; +using CommonUtilities; using Microsoft.Azure.Batch; using Microsoft.Azure.Batch.Common; -using Polly; -using Polly.Retry; using Tes.Models; +using static CommonUtilities.RetryHandler; using static GenerateBatchVmSkus.Program; /* @@ -129,9 +129,11 @@ async ValueTask GetResults(IAsyncEnumerable res } } - private static readonly AsyncRetryPolicy asyncRetryPolicy = Policy - .Handle() - .WaitAndRetryForeverAsync(i => TimeSpan.FromSeconds(0.05)); + private static readonly AsyncRetryHandlerPolicy asyncRetryPolicy = new RetryPolicyBuilder(Microsoft.Extensions.Options.Options.Create(new CommonUtilities.Options.RetryPolicyOptions())) + .PolicyBuilder.OpinionatedRetryPolicy() + .WithCustomizedRetryPolicyOptionsWait(int.MaxValue, (_, _) => TimeSpan.FromSeconds(0.05)) + .SetOnRetryBehavior() + .AsyncBuild(); private static IDictionary? 
batchSkus; @@ -225,12 +227,12 @@ private async ValueTask> GetVmSkusAsync(TestContext co if (CanBatchAccountValidateSku(vm, context)) { result = result.Append(vm); - await asyncRetryPolicy.ExecuteAsync(WriteLog("sort", "process", vm), cancellationToken); + await asyncRetryPolicy.ExecuteWithRetryAsync(WriteLog("sort", "process", vm), cancellationToken); } else { await resultSkus.Writer.WriteAsync(vm, cancellationToken); - await asyncRetryPolicy.ExecuteAsync(WriteLog("sort", "forward", vm), cancellationToken); + await asyncRetryPolicy.ExecuteWithRetryAsync(WriteLog("sort", "forward", vm), cancellationToken); } } @@ -318,7 +320,7 @@ private async ValueTask ValidateSkus(CancellationToken cancellationToken) var StartLoadedTest = new Func>(async vmSize => { _ = Interlocked.Increment(ref started); - await asyncRetryPolicy.ExecuteAsync(WriteLog("process", "post", vmSize), cancellationToken); + await asyncRetryPolicy.ExecuteWithRetryAsync(WriteLog("process", "post", vmSize), cancellationToken); return (vmSize, result: await TestVMSizeInBatchAsync(vmSize, cancellationToken)); }); @@ -337,7 +339,7 @@ private async ValueTask ValidateSkus(CancellationToken cancellationToken) { List skusToTest = new(await GetVmSkusAsync(context, cancellationToken)); await skusToTest.ToAsyncEnumerable() - .ForEachAwaitWithCancellationAsync(async (sku, token) => await asyncRetryPolicy.ExecuteAsync(WriteLog("process", "queue", sku), token), cancellationToken); + .ForEachAwaitWithCancellationAsync(async (sku, token) => await asyncRetryPolicy.ExecuteWithRetryAsync(WriteLog("process", "queue", sku), token), cancellationToken); var loadedTests = skusToTest.Where(CanTestNow).ToList(); for (tests = loadedTests.Select(StartLoadedTest).ToList(); @@ -369,20 +371,20 @@ await skusToTest.ToAsyncEnumerable() _ = retries.Remove(vmSize.VmSku.Name); vmSize.Validated = true; await resultSkus.Writer.WriteAsync(vmSize, cancellationToken); - await asyncRetryPolicy.ExecuteAsync(WriteLog("process", "use", vmSize), 
cancellationToken); + await asyncRetryPolicy.ExecuteWithRetryAsync(WriteLog("process", "use", vmSize), cancellationToken); break; case VerifyVMIResult.Skip: ++processed; _ = retries.Remove(vmSize.VmSku.Name); - await asyncRetryPolicy.ExecuteAsync(WriteLog("process", "skip", vmSize), cancellationToken); + await asyncRetryPolicy.ExecuteWithRetryAsync(WriteLog("process", "skip", vmSize), cancellationToken); break; case VerifyVMIResult.NextRegion: ++processedDeferred; _ = retries.Remove(vmSize.VmSku.Name); await resultSkus.Writer.WriteAsync(vmSize, cancellationToken); - await asyncRetryPolicy.ExecuteAsync(WriteLog("process", "forward", vmSize), cancellationToken); + await asyncRetryPolicy.ExecuteWithRetryAsync(WriteLog("process", "forward", vmSize), cancellationToken); break; case VerifyVMIResult.Retry: @@ -392,7 +394,7 @@ await skusToTest.ToAsyncEnumerable() retries[vmSize.VmSku.Name] = (false, lastRetry.RetryCount + 1, DateTime.UtcNow + AzureBatchSkuValidator.RetryWaitTime, vmSize); _ = Interlocked.Decrement(ref started); _ = Interlocked.Decrement(ref completed); - await asyncRetryPolicy.ExecuteAsync(WriteLog("process", $"wait{lastRetry.RetryCount}", vmSize), cancellationToken); + await asyncRetryPolicy.ExecuteWithRetryAsync(WriteLog("process", $"wait{lastRetry.RetryCount}", vmSize), cancellationToken); } else { @@ -405,7 +407,7 @@ await skusToTest.ToAsyncEnumerable() ++processed; _ = retries.Remove(vmSize.VmSku.Name); - await asyncRetryPolicy.ExecuteAsync(WriteLog("process", "skipRT", vmSize), cancellationToken); + await asyncRetryPolicy.ExecuteWithRetryAsync(WriteLog("process", "skipRT", vmSize), cancellationToken); } break; @@ -432,7 +434,7 @@ await skusToTest.ToAsyncEnumerable() skusToTest.AddRange(await (await GetVmSkusAsync(context, cancellationToken)).ToAsyncEnumerable() .WhereAwaitWithCancellation(async (vmSize, token) => { - await asyncRetryPolicy.ExecuteAsync(WriteLog("process", "queue", vmSize), cancellationToken); + await 
asyncRetryPolicy.ExecuteWithRetryAsync(WriteLog("process", "queue", vmSize), cancellationToken); return true; }) .ToListAsync(cancellationToken)); @@ -464,7 +466,7 @@ await skusToTest.ToAsyncEnumerable() .ToAsyncEnumerable() .WhereAwaitWithCancellation(async (vmSize, token) => { - await asyncRetryPolicy.ExecuteAsync(WriteLog("process", "queueRT", vmSize), cancellationToken); + await asyncRetryPolicy.ExecuteWithRetryAsync(WriteLog("process", "queueRT", vmSize), cancellationToken); return true; }) .ToListAsync(cancellationToken)); @@ -515,7 +517,7 @@ await skusToTest.ToAsyncEnumerable() await skusToTest.ToAsyncEnumerable().ForEachAwaitWithCancellationAsync(async (vmSize, token) => { await resultSkus.Writer.WriteAsync(vmSize, token); - await asyncRetryPolicy.ExecuteAsync(WriteLog("process", "dump", vmSize), cancellationToken); + await asyncRetryPolicy.ExecuteWithRetryAsync(WriteLog("process", "dump", vmSize), cancellationToken); lock (consoleLock) { SetForegroundColor(ConsoleColor.Yellow); diff --git a/src/Tes/Repository/DatabaseOverloadedException.cs b/src/Tes.Repository/DatabaseOverloadedException.cs similarity index 100% rename from src/Tes/Repository/DatabaseOverloadedException.cs rename to src/Tes.Repository/DatabaseOverloadedException.cs diff --git a/src/Tes/Repository/ICache.cs b/src/Tes.Repository/ICache.cs similarity index 100% rename from src/Tes/Repository/ICache.cs rename to src/Tes.Repository/ICache.cs diff --git a/src/Tes/Repository/IRepository.cs b/src/Tes.Repository/IRepository.cs similarity index 100% rename from src/Tes/Repository/IRepository.cs rename to src/Tes.Repository/IRepository.cs diff --git a/src/Tes/Migrations/20230106185229_InitialCreate.Designer.cs b/src/Tes.Repository/Migrations/20230106185229_InitialCreate.Designer.cs similarity index 100% rename from src/Tes/Migrations/20230106185229_InitialCreate.Designer.cs rename to src/Tes.Repository/Migrations/20230106185229_InitialCreate.Designer.cs diff --git 
a/src/Tes/Migrations/20230106185229_InitialCreate.cs b/src/Tes.Repository/Migrations/20230106185229_InitialCreate.cs similarity index 100% rename from src/Tes/Migrations/20230106185229_InitialCreate.cs rename to src/Tes.Repository/Migrations/20230106185229_InitialCreate.cs diff --git a/src/Tes/Migrations/20230320202549_AddIndicesToJson.Designer.cs b/src/Tes.Repository/Migrations/20230320202549_AddIndicesToJson.Designer.cs similarity index 100% rename from src/Tes/Migrations/20230320202549_AddIndicesToJson.Designer.cs rename to src/Tes.Repository/Migrations/20230320202549_AddIndicesToJson.Designer.cs diff --git a/src/Tes/Migrations/20230320202549_AddIndicesToJson.cs b/src/Tes.Repository/Migrations/20230320202549_AddIndicesToJson.cs similarity index 100% rename from src/Tes/Migrations/20230320202549_AddIndicesToJson.cs rename to src/Tes.Repository/Migrations/20230320202549_AddIndicesToJson.cs diff --git a/src/Tes/Migrations/20230808235207_AddGinIndex.Designer.cs b/src/Tes.Repository/Migrations/20230808235207_AddGinIndex.Designer.cs similarity index 100% rename from src/Tes/Migrations/20230808235207_AddGinIndex.Designer.cs rename to src/Tes.Repository/Migrations/20230808235207_AddGinIndex.Designer.cs diff --git a/src/Tes/Migrations/20230808235207_AddGinIndex.cs b/src/Tes.Repository/Migrations/20230808235207_AddGinIndex.cs similarity index 100% rename from src/Tes/Migrations/20230808235207_AddGinIndex.cs rename to src/Tes.Repository/Migrations/20230808235207_AddGinIndex.cs diff --git a/src/Tes/Migrations/TesDbContextModelSnapshot.cs b/src/Tes.Repository/Migrations/TesDbContextModelSnapshot.cs similarity index 100% rename from src/Tes/Migrations/TesDbContextModelSnapshot.cs rename to src/Tes.Repository/Migrations/TesDbContextModelSnapshot.cs diff --git a/src/Tes/Models/PostgreSqlOptions.cs b/src/Tes.Repository/Models/PostgreSqlOptions.cs similarity index 100% rename from src/Tes/Models/PostgreSqlOptions.cs rename to src/Tes.Repository/Models/PostgreSqlOptions.cs diff 
--git a/src/Tes/Models/TesTaskPostgres.cs b/src/Tes.Repository/Models/TesTaskPostgres.cs similarity index 100% rename from src/Tes/Models/TesTaskPostgres.cs rename to src/Tes.Repository/Models/TesTaskPostgres.cs diff --git a/src/Tes/Repository/PostgreSqlCachingRepository.cs b/src/Tes.Repository/PostgreSqlCachingRepository.cs similarity index 94% rename from src/Tes/Repository/PostgreSqlCachingRepository.cs rename to src/Tes.Repository/PostgreSqlCachingRepository.cs index 660d3d7a2..f83e740f2 100644 --- a/src/Tes/Repository/PostgreSqlCachingRepository.cs +++ b/src/Tes.Repository/PostgreSqlCachingRepository.cs @@ -9,9 +9,10 @@ using System.Threading; using System.Threading.Channels; using System.Threading.Tasks; +using CommonUtilities; using Microsoft.EntityFrameworkCore; using Microsoft.Extensions.Logging; -using Polly; +using static CommonUtilities.RetryHandler; namespace Tes.Repository { @@ -25,9 +26,11 @@ public abstract class PostgreSqlCachingRepository : IDisposable private const int BatchSize = 1000; private static readonly TimeSpan defaultCompletedTaskCacheExpiration = TimeSpan.FromDays(1); - protected readonly AsyncPolicy asyncPolicy = Policy - .Handle(e => e.IsTransient) - .WaitAndRetryAsync(10, i => TimeSpan.FromSeconds(Math.Pow(2, i))); + protected readonly AsyncRetryHandlerPolicy asyncPolicy = new RetryPolicyBuilder(Microsoft.Extensions.Options.Options.Create(new CommonUtilities.Options.RetryPolicyOptions() { ExponentialBackOffExponent = 2, MaxRetryCount = 10 })) + .PolicyBuilder.OpinionatedRetryPolicy(Polly.Policy.Handle(e => e.IsTransient)) + .WithRetryPolicyOptionsWait() + .SetOnRetryBehavior() + .AsyncBuild(); private record struct WriteItem(TDbItem DbItem, WriteAction Action, TaskCompletionSource TaskSource); private readonly Channel itemsToWrite = Channel.CreateUnbounded(new() { SingleReader = true }); @@ -135,7 +138,7 @@ protected async Task> GetItemsAsync(DbSet dbSet, C //var sqlQuery = query.ToQueryString(); 
//System.Diagnostics.Debugger.Break(); - return await asyncPolicy.ExecuteAsync(query.ToListAsync, cancellationToken); + return await asyncPolicy.ExecuteWithRetryAsync(query.ToListAsync, cancellationToken); } /// @@ -220,7 +223,7 @@ private async ValueTask WriteItemsAsync(IList dbItems, CancellationTo dbContext.AddRange(dbItems.Where(e => WriteAction.Add.Equals(e.Action)).Select(e => e.DbItem)); dbContext.UpdateRange(dbItems.Where(e => WriteAction.Update.Equals(e.Action)).Select(e => e.DbItem)); dbContext.RemoveRange(dbItems.Where(e => WriteAction.Delete.Equals(e.Action)).Select(e => e.DbItem)); - await asyncPolicy.ExecuteAsync(dbContext.SaveChangesAsync, cancellationToken); + await asyncPolicy.ExecuteWithRetryAsync(dbContext.SaveChangesAsync, cancellationToken); OperateOnAll(dbItems, ActionOnSuccess()); } catch (Exception ex) diff --git a/src/Tes/Repository/RepositoryCollisionException.cs b/src/Tes.Repository/RepositoryCollisionException.cs similarity index 100% rename from src/Tes/Repository/RepositoryCollisionException.cs rename to src/Tes.Repository/RepositoryCollisionException.cs diff --git a/src/Tes.Repository/Tes.Repository.csproj b/src/Tes.Repository/Tes.Repository.csproj new file mode 100644 index 000000000..1df4830ea --- /dev/null +++ b/src/Tes.Repository/Tes.Repository.csproj @@ -0,0 +1,31 @@ + + + + net8.0 + $(Product) TES repository library + + + + + <_Parameter1>TesApi.Tests + + + + + + + all + runtime; build; native; contentfiles; analyzers; buildtransitive + + + + + + + + + + + + + diff --git a/src/Tes/Repository/TesDbContext.cs b/src/Tes.Repository/TesDbContext.cs similarity index 100% rename from src/Tes/Repository/TesDbContext.cs rename to src/Tes.Repository/TesDbContext.cs diff --git a/src/Tes/Repository/TesRepositoryCache.cs b/src/Tes.Repository/TesRepositoryCache.cs similarity index 100% rename from src/Tes/Repository/TesRepositoryCache.cs rename to src/Tes.Repository/TesRepositoryCache.cs diff --git 
a/src/Tes/Repository/TesTaskPostgreSqlRepository.cs b/src/Tes.Repository/TesTaskPostgreSqlRepository.cs similarity index 99% rename from src/Tes/Repository/TesTaskPostgreSqlRepository.cs rename to src/Tes.Repository/TesTaskPostgreSqlRepository.cs index 0be0c82e1..b6ff045cb 100644 --- a/src/Tes/Repository/TesTaskPostgreSqlRepository.cs +++ b/src/Tes.Repository/TesTaskPostgreSqlRepository.cs @@ -270,7 +270,7 @@ private async Task GetItemFromCacheOrDatabase(string id, bo using var dbContext = CreateDbContext(); // Search for Id within the JSON - item = await ExecuteNpgsqlActionAsync(async () => await asyncPolicy.ExecuteAsync(ct => dbContext.TesTasks.FirstOrDefaultAsync(t => t.Json.Id == id, ct), cancellationToken)); + item = await ExecuteNpgsqlActionAsync(async () => await asyncPolicy.ExecuteWithRetryAsync(ct => dbContext.TesTasks.FirstOrDefaultAsync(t => t.Json.Id == id, ct), cancellationToken)); if (throwIfNotFound && item is null) { diff --git a/src/Tes/Utilities/ExpressionParameterSubstitute.cs b/src/Tes.Repository/Utilities/ExpressionParameterSubstitute.cs similarity index 100% rename from src/Tes/Utilities/ExpressionParameterSubstitute.cs rename to src/Tes.Repository/Utilities/ExpressionParameterSubstitute.cs diff --git a/src/Tes/Utilities/PostgresConnectionStringUtility.cs b/src/Tes.Repository/Utilities/PostgresConnectionStringUtility.cs similarity index 100% rename from src/Tes/Utilities/PostgresConnectionStringUtility.cs rename to src/Tes.Repository/Utilities/PostgresConnectionStringUtility.cs diff --git a/src/Tes/Utilities/PrependableFormattableString.cs b/src/Tes.Repository/Utilities/PrependableFormattableString.cs similarity index 100% rename from src/Tes/Utilities/PrependableFormattableString.cs rename to src/Tes.Repository/Utilities/PrependableFormattableString.cs diff --git a/src/Tes.Runner/Authentication/CredentialsManager.cs b/src/Tes.Runner/Authentication/CredentialsManager.cs index 31c06c9f7..b4c87dcb8 100644 --- 
a/src/Tes.Runner/Authentication/CredentialsManager.cs +++ b/src/Tes.Runner/Authentication/CredentialsManager.cs @@ -3,12 +3,12 @@ using Azure.Core; using Azure.Identity; +using CommonUtilities; using Microsoft.Extensions.Logging; -using Polly; -using Polly.Retry; using Tes.Runner.Exceptions; using Tes.Runner.Models; using Tes.Runner.Transfer; +using static CommonUtilities.RetryHandler; namespace Tes.Runner.Authentication { @@ -16,24 +16,16 @@ public class CredentialsManager { private readonly ILogger logger = PipelineLoggerFactory.Create(); - private readonly RetryPolicy retryPolicy; + private readonly RetryHandlerPolicy retryPolicy; private const int MaxRetryCount = 7; private const int ExponentialBackOffExponent = 2; public CredentialsManager() { - retryPolicy = Policy - .Handle() - .WaitAndRetry(MaxRetryCount, - SleepDurationHandler); - } - - private TimeSpan SleepDurationHandler(int attempt) - { - logger.LogInformation("Attempt {Attempt} to get token credential", attempt); - var duration = TimeSpan.FromSeconds(Math.Pow(ExponentialBackOffExponent, attempt)); - logger.LogInformation("Waiting {Duration} before retrying", duration); - return duration; + retryPolicy = new RetryPolicyBuilder(Microsoft.Extensions.Options.Options.Create(new CommonUtilities.Options.RetryPolicyOptions() { ExponentialBackOffExponent = ExponentialBackOffExponent, MaxRetryCount = MaxRetryCount })) + .DefaultRetryPolicyBuilder() + .SetOnRetryBehavior(logger) + .SyncBuild(); } public virtual TokenCredential GetTokenCredential(RuntimeOptions runtimeOptions, string? 
tokenScope = default) @@ -56,7 +48,7 @@ public virtual TokenCredential GetTokenCredential(RuntimeOptions runtimeOptions, tokenScope ??= runtimeOptions.AzureEnvironmentConfig!.TokenScope!; try { - return retryPolicy.Execute(() => GetTokenCredentialImpl(managedIdentityResourceId, tokenScope, runtimeOptions.AzureEnvironmentConfig!.AzureAuthorityHostUrl!)); + return retryPolicy.ExecuteWithRetry(() => GetTokenCredentialImpl(managedIdentityResourceId, tokenScope, runtimeOptions.AzureEnvironmentConfig!.AzureAuthorityHostUrl!)); } catch { diff --git a/src/Tes.Runner/Transfer/BlobApiHttpUtils.cs b/src/Tes.Runner/Transfer/BlobApiHttpUtils.cs index 0a8808cb1..b578215c3 100644 --- a/src/Tes.Runner/Transfer/BlobApiHttpUtils.cs +++ b/src/Tes.Runner/Transfer/BlobApiHttpUtils.cs @@ -7,13 +7,13 @@ using System.Text; using Azure.Storage.Blobs; using Microsoft.Extensions.Logging; -using Polly.Retry; +using static CommonUtilities.RetryHandler; namespace Tes.Runner.Transfer; /// /// A class containing the logic to create and make the HTTP requests for the blob block API. 
/// -public class BlobApiHttpUtils(HttpClient httpClient, AsyncRetryPolicy retryPolicy) +public class BlobApiHttpUtils(HttpClient httpClient, AsyncRetryHandlerPolicy retryPolicy) { //https://learn.microsoft.com/en-us/rest/api/storageservices/understanding-block-blobs--append-blobs--and-page-blobs public const string DefaultApiVersion = "2023-05-03"; @@ -22,7 +22,7 @@ public class BlobApiHttpUtils(HttpClient httpClient, AsyncRetryPolicy retryPolic private readonly HttpClient httpClient = httpClient; private static readonly ILogger Logger = PipelineLoggerFactory.Create(); - private readonly AsyncRetryPolicy retryPolicy = retryPolicy; + private readonly AsyncRetryHandlerPolicy retryPolicy = retryPolicy; public const string RootHashMetadataName = "md5_4mib_hashlist_root_hash"; @@ -167,7 +167,7 @@ public static HttpRequestMessage CreateBlobBlockListRequest(long length, Uri blo public async Task ExecuteHttpRequestAsync(Func requestFactory, CancellationToken cancellationToken = default) { - return await retryPolicy.ExecuteAsync(ct => ExecuteHttpRequestImplAsync(requestFactory, ct), cancellationToken); + return await retryPolicy.ExecuteWithRetryAsync(ct => ExecuteHttpRequestImplAsync(requestFactory, ct), cancellationToken); } public static bool UrlContainsSasToken(string sourceUrl) @@ -239,7 +239,7 @@ private static void HandleHttpRequestException(HttpStatusCode? status, HttpReque public async Task ExecuteHttpRequestAndReadBodyResponseAsync(PipelineBuffer buffer, Func requestFactory, CancellationToken cancellationToken = default) { - return await retryPolicy.ExecuteAsync(ct => ExecuteHttpRequestAndReadBodyResponseImplAsync(buffer, requestFactory, ct), cancellationToken); + return await retryPolicy.ExecuteWithRetryAsync(ct => ExecuteHttpRequestAndReadBodyResponseImplAsync(buffer, requestFactory, ct), cancellationToken); } private static bool ContainsRetriableException(Exception? 
ex) diff --git a/src/Tes.Runner/Transfer/HttpRetryPolicyDefinition.cs b/src/Tes.Runner/Transfer/HttpRetryPolicyDefinition.cs index 8dc051e91..b733b6331 100644 --- a/src/Tes.Runner/Transfer/HttpRetryPolicyDefinition.cs +++ b/src/Tes.Runner/Transfer/HttpRetryPolicyDefinition.cs @@ -1,9 +1,9 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. +using CommonUtilities; using Microsoft.Extensions.Logging; -using Polly; -using Polly.Retry; +using static CommonUtilities.RetryHandler; namespace Tes.Runner.Transfer { @@ -13,20 +13,13 @@ public class HttpRetryPolicyDefinition public const int RetryExponent = 2; private static readonly ILogger Logger = PipelineLoggerFactory.Create(); - public static AsyncRetryPolicy DefaultAsyncRetryPolicy(int maxRetryCount = DefaultMaxRetryCount) + public static AsyncRetryHandlerPolicy DefaultAsyncRetryPolicy(int maxRetryCount = DefaultMaxRetryCount) { - return Policy - .Handle() - .WaitAndRetryAsync(maxRetryCount, retryAttempt => - { - return TimeSpan.FromSeconds(Math.Pow(RetryExponent, retryAttempt)); - }, - onRetryAsync: - (exception, _, retryCount, _) => - { - Logger.LogError(exception, "Retrying failed request. 
Retry count: {retryCount}", retryCount); - return Task.CompletedTask; - }); + return new RetryPolicyBuilder(Microsoft.Extensions.Options.Options.Create(new CommonUtilities.Options.RetryPolicyOptions() { MaxRetryCount = maxRetryCount, ExponentialBackOffExponent = RetryExponent })) + .PolicyBuilder.OpinionatedRetryPolicy(Polly.Policy.Handle()) + .WithRetryPolicyOptionsWait() + .SetOnRetryBehavior(Logger) + .AsyncBuild(); } } } diff --git a/src/Tes.SDK/Tes.SDK.csproj b/src/Tes.SDK/Tes.SDK.csproj index 6e8ccbd47..30f5ea9fa 100644 --- a/src/Tes.SDK/Tes.SDK.csproj +++ b/src/Tes.SDK/Tes.SDK.csproj @@ -8,7 +8,11 @@ - + + + + + diff --git a/src/Tes/Extensions/TesTaskExtensions.cs b/src/Tes/Extensions/TesTaskExtensions.cs index 4e5a7d009..4886e88f0 100644 --- a/src/Tes/Extensions/TesTaskExtensions.cs +++ b/src/Tes/Extensions/TesTaskExtensions.cs @@ -1,7 +1,6 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. -using System; using System.Collections.Generic; using System.Linq; using Tes.Models; @@ -34,31 +33,6 @@ public static CromwellTaskSubmitter GetCromwellMetadata(this TesTask tesTask) return tesTask.TaskSubmitter as CromwellTaskSubmitter; } - /// - /// Visits each value in an enumeration with an action. - /// - /// Type of enumerated items. - /// Enumeration on which to visit each item. - /// Action to invoke with each item. - internal static void ForEach(this IEnumerable values, Action action) - { - foreach (var value in values) - { - action(value); - } - } - - /// - /// Adds a range of items to an . - /// - /// Type of enumerated items. - /// List to add to. - /// Items to add to . - public static void AddRange(this IList list, IEnumerable items) - { - items.ForEach(list.Add); - } - /// /// Writes to system log. 
/// diff --git a/src/Tes/Repository/RepositoryItem.cs b/src/Tes/Repository/RepositoryItem.cs index 0fcc747cf..92ceccb98 100644 --- a/src/Tes/Repository/RepositoryItem.cs +++ b/src/Tes/Repository/RepositoryItem.cs @@ -10,7 +10,7 @@ namespace Tes.Repository { /// - /// Base class for items stored in . The type must contain a property named "id". + /// Base class for items stored in IRepository{T}. The type must contain a property named "id". /// /// Type of the item [DataContract] diff --git a/src/Tes/Tes.csproj b/src/Tes/Tes.csproj index df328d9f9..f02e20d5c 100644 --- a/src/Tes/Tes.csproj +++ b/src/Tes/Tes.csproj @@ -2,27 +2,18 @@ net8.0 - $(Product) TES repository library + $(Product) GA4GH TES - + - - - all - runtime; build; native; contentfiles; analyzers; buildtransitive - - - - - diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index 5b4b02d66..17511a07b 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -20,7 +20,6 @@ using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using Microsoft.Rest; -using Polly; using TesApi.Web.Extensions; using TesApi.Web.Management; using TesApi.Web.Management.Configuration; @@ -84,13 +83,13 @@ public AzureProxy(IOptions batchAccountOptions, BatchAccoun } batchRetryPolicyWhenJobNotFound = retryHandler.PolicyBuilder - .OpinionatedRetryPolicy(Policy.Handle(ex => BatchErrorCodeStrings.JobNotFound.Equals(ex.RequestInformation.BatchError.Code, StringComparison.OrdinalIgnoreCase))) + .OpinionatedRetryPolicy(Polly.Policy.Handle(ex => BatchErrorCodeStrings.JobNotFound.Equals(ex.RequestInformation.BatchError.Code, StringComparison.OrdinalIgnoreCase))) .WithExceptionBasedWaitWithRetryPolicyOptionsBackup((attempt, exception) => (exception as BatchException)?.RequestInformation?.RetryAfter) .SetOnRetryBehavior(onRetry: LogRetryErrorOnRetryHandler()) .AsyncBuild(); batchRetryPolicyWhenNodeNotReady = retryHandler.PolicyBuilder - .OpinionatedRetryPolicy(Policy.Handle(ex => 
"NodeNotReady".Equals(ex.RequestInformation.BatchError.Code, StringComparison.OrdinalIgnoreCase))) + .OpinionatedRetryPolicy(Polly.Policy.Handle(ex => "NodeNotReady".Equals(ex.RequestInformation.BatchError.Code, StringComparison.OrdinalIgnoreCase))) .WithExceptionBasedWaitWithRetryPolicyOptionsBackup((attempt, exception) => (exception as BatchException)?.RequestInformation?.RetryAfter) .SetOnRetryBehavior(onRetry: LogRetryErrorOnRetryHandler()) .AsyncBuild(); diff --git a/src/TesApi.Web/Management/ArmBatchQuotaProvider.cs b/src/TesApi.Web/Management/ArmBatchQuotaProvider.cs index 5bdd63ace..b32df2e54 100644 --- a/src/TesApi.Web/Management/ArmBatchQuotaProvider.cs +++ b/src/TesApi.Web/Management/ArmBatchQuotaProvider.cs @@ -9,7 +9,6 @@ using Microsoft.Extensions.Caching.Memory; using Microsoft.Extensions.Logging; using TesApi.Web.Management.Models.Quotas; -using static TesApi.Web.Management.IBatchQuotaProvider; namespace TesApi.Web.Management; diff --git a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs index b68cda0ff..71cf7e222 100644 --- a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs +++ b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs @@ -61,12 +61,11 @@ protected sealed override async Task ExecuteAsync(CancellationToken stoppingToke { Logger.LogCritical(ex, "Service {ServiceName} was unable to initialize due to '{Message}'.", GetType().Name, ex.Message); hostApplicationLifetime?.StopApplication(); + return; } Logger.LogInformation(MarkLogMessage("started.")); - await ExecuteCoreAsync(stoppingToken); - Logger.LogInformation(MarkLogMessage("gracefully stopped.")); } diff --git a/src/TesApi.Web/TaskScheduler.cs b/src/TesApi.Web/TaskScheduler.cs index b56eb34a7..d03101f7b 100644 --- a/src/TesApi.Web/TaskScheduler.cs +++ b/src/TesApi.Web/TaskScheduler.cs @@ -85,6 +85,11 @@ protected override async ValueTask ExecuteSetupAsync(CancellationToken cancellat throw; } + if 
(cancellationToken.IsCancellationRequested) + { + return; + } + Logger.LogDebug(@"Querying active tasks"); foreach (var tesTask in @@ -131,23 +136,32 @@ protected override async ValueTask ExecuteCoreAsync(CancellationToken cancellati stoppingToken = cancellationToken; List queuedTasks = []; - while (queuedTesTasks.TryDequeue(out var tesTask)) + while (!cancellationToken.IsCancellationRequested && queuedTesTasks.TryDequeue(out var tesTask)) { queuedTasks.Add(((ITaskScheduler)this).ProcessQueuedTesTaskAsync(tesTask, cancellationToken)); } - while (tesTaskBatchStates.TryDequeue(out var result)) + while (!cancellationToken.IsCancellationRequested && tesTaskBatchStates.TryDequeue(out var result)) { queuedTasks.Add(ProcessQueuedTesTaskStatesRequestAsync(result.TesTasks, result.TaskStates, result.Channel, cancellationToken)); } + if (cancellationToken.IsCancellationRequested) + { + return; + } + queuedTasks.Add(ExecuteShortBackgroundTasksAsync(cancellationToken)); queuedTasks.Add(ExecuteLongBackgroundTasksAsync(cancellationToken)); queuedTasks.Add(ExecuteCancelledTesTasksOnBatchAsync(cancellationToken)); queuedTasks.Add(ExecuteUpdateTesTaskFromEventBlobAsync(cancellationToken)); - Logger.LogDebug(@"Task load: {TaskCount}", queuedTasks.Count); + if (cancellationToken.IsCancellationRequested) + { + return; + } + Logger.LogDebug(@"Task load: {TaskCount}", queuedTasks.Count); await Task.WhenAll(queuedTasks); } diff --git a/src/TesApi.Web/TesApi.Web.csproj b/src/TesApi.Web/TesApi.Web.csproj index 0b192ddfc..424144bda 100644 --- a/src/TesApi.Web/TesApi.Web.csproj +++ b/src/TesApi.Web/TesApi.Web.csproj @@ -79,11 +79,10 @@ - - + diff --git a/src/deploy-tes-on-azure/Deployer.cs b/src/deploy-tes-on-azure/Deployer.cs index a3295a8c5..fc9910d5e 100644 --- a/src/deploy-tes-on-azure/Deployer.cs +++ b/src/deploy-tes-on-azure/Deployer.cs @@ -45,14 +45,11 @@ using CommonUtilities; using CommonUtilities.AzureCloud; using k8s; -using Microsoft.EntityFrameworkCore; using Microsoft.Graph; 
using Newtonsoft.Json; -using Polly; -using Polly.Retry; -using Tes.Extensions; using Tes.Models; using Tes.SDK; +using static CommonUtilities.RetryHandler; using Batch = Azure.ResourceManager.Batch.Models; using Storage = Azure.ResourceManager.Storage.Models; @@ -60,26 +57,34 @@ namespace TesDeployer { public class Deployer(Configuration configuration) { - private static readonly AsyncRetryPolicy roleAssignmentHashConflictRetryPolicy = Policy - .Handle(requestFailedException => - "HashConflictOnDifferentRoleAssignmentIds".Equals(requestFailedException.ErrorCode)) - .RetryAsync(); - - private static readonly AsyncRetryPolicy operationNotAllowedConflictRetryPolicy = Policy - .Handle(azureException => + private static readonly AsyncRetryHandlerPolicy roleAssignmentHashConflictRetryPolicy = new RetryPolicyBuilder(Microsoft.Extensions.Options.Options.Create(new CommonUtilities.Options.RetryPolicyOptions())) + .PolicyBuilder.OpinionatedRetryPolicy(Polly.Policy.Handle(requestFailedException => + "HashConflictOnDifferentRoleAssignmentIds".Equals(requestFailedException.ErrorCode))) + .WithCustomizedRetryPolicyOptionsWait(int.MaxValue, (_, _) => TimeSpan.Zero) + .SetOnRetryBehavior() + .AsyncBuild(); + + private static readonly AsyncRetryHandlerPolicy operationNotAllowedConflictRetryPolicy = new RetryPolicyBuilder(Microsoft.Extensions.Options.Options.Create(new CommonUtilities.Options.RetryPolicyOptions())) + .PolicyBuilder.OpinionatedRetryPolicy(Polly.Policy.Handle(azureException => (int)HttpStatusCode.Conflict == azureException.Status && - "OperationNotAllowed".Equals(azureException.ErrorCode)) - .WaitAndRetryAsync(30, retryAttempt => TimeSpan.FromSeconds(10)); - - private static readonly AsyncRetryPolicy generalRetryPolicy = Policy - .Handle() - .WaitAndRetryAsync(3, retryAttempt => TimeSpan.FromSeconds(1)); - - private static readonly AsyncRetryPolicy internalServerErrorRetryPolicy = Policy - .Handle(azureException => + 
"OperationNotAllowed".Equals(azureException.ErrorCode))) + .WithCustomizedRetryPolicyOptionsWait(30, (_, _) => TimeSpan.FromSeconds(10)) + .SetOnRetryBehavior() + .AsyncBuild(); + + private static readonly AsyncRetryHandlerPolicy generalRetryPolicy = new RetryPolicyBuilder(Microsoft.Extensions.Options.Options.Create(new CommonUtilities.Options.RetryPolicyOptions())) + .PolicyBuilder.OpinionatedRetryPolicy() + .WithCustomizedRetryPolicyOptionsWait(3, (_, _) => TimeSpan.FromSeconds(1)) + .SetOnRetryBehavior() + .AsyncBuild(); + + private static readonly AsyncRetryHandlerPolicy internalServerErrorRetryPolicy = new RetryPolicyBuilder(Microsoft.Extensions.Options.Options.Create(new CommonUtilities.Options.RetryPolicyOptions())) + .PolicyBuilder.OpinionatedRetryPolicy(Polly.Policy.Handle(azureException => (int)HttpStatusCode.OK == azureException.Status && - "InternalServerError".Equals(azureException.ErrorCode)) - .WaitAndRetryAsync(3, retryAttempt => longRetryWaitTime); + "InternalServerError".Equals(azureException.ErrorCode))) + .WithCustomizedRetryPolicyOptionsWait(3, (_, _) => longRetryWaitTime) + .SetOnRetryBehavior() + .AsyncBuild(); private static readonly TimeSpan longRetryWaitTime = TimeSpan.FromSeconds(15); @@ -383,7 +388,7 @@ await Execute("Connecting to Azure Services...", async () => if (installedVersion is null || installedVersion < new Version(5, 2, 2)) { - await operationNotAllowedConflictRetryPolicy.ExecuteAsync(() => EnableWorkloadIdentity(aksCluster, managedIdentity, resourceGroup)); + await operationNotAllowedConflictRetryPolicy.ExecuteWithRetryAsync(() => EnableWorkloadIdentity(aksCluster, managedIdentity, resourceGroup)); await kubernetesManager.RemovePodAadChart(); } @@ -1630,7 +1635,7 @@ private async Task CreatePostgreSqlServerAndDa var server = await Execute( $"Creating Azure Flexible Server for PostgreSQL: {configuration.PostgreSqlServerName}...", - async () => (await internalServerErrorRetryPolicy.ExecuteAsync(token => 
resourceGroup.GetPostgreSqlFlexibleServers().CreateOrUpdateAsync(WaitUntil.Completed, configuration.PostgreSqlServerName, data, token), cts.Token)).Value); + async () => (await internalServerErrorRetryPolicy.ExecuteWithRetryAsync(token => resourceGroup.GetPostgreSqlFlexibleServers().CreateOrUpdateAsync(WaitUntil.Completed, configuration.PostgreSqlServerName, data, token), cts.Token)).Value); await Execute( $"Creating PostgreSQL tes database: {configuration.PostgreSqlTesDatabaseName}...", @@ -1686,7 +1691,7 @@ private async Task AssignRoleToResourceAsync(UserAssignedIdentityResource return false; } - await Execute(message, () => roleAssignmentHashConflictRetryPolicy.ExecuteAsync(token => + await Execute(message, () => roleAssignmentHashConflictRetryPolicy.ExecuteWithRetryAsync(token => (Task)resource.GetRoleAssignments().CreateOrUpdateAsync(WaitUntil.Completed, Guid.NewGuid().ToString(), new(roleDefinitionId, managedIdentity.Data.PrincipalId.Value) { @@ -2274,7 +2279,7 @@ private static void AddServiceEndpointsToSubnet(SubnetData subnet) private async Task ValidateVmAsync() { - var computeSkus = await generalRetryPolicy.ExecuteAsync(async ct => + var computeSkus = await generalRetryPolicy.ExecuteWithRetryAsync(async ct => await armSubscription.GetComputeResourceSkusAsync( filter: $"location eq '{configuration.RegionName}'", cancellationToken: ct) diff --git a/src/deploy-tes-on-azure/KubernetesManager.cs b/src/deploy-tes-on-azure/KubernetesManager.cs index 288b0069c..c41cd02ce 100644 --- a/src/deploy-tes-on-azure/KubernetesManager.cs +++ b/src/deploy-tes-on-azure/KubernetesManager.cs @@ -11,15 +11,14 @@ using System.Text; using System.Threading; using System.Threading.Tasks; -using Azure.ResourceManager; using Azure.ResourceManager.ContainerService; using Azure.ResourceManager.ManagedServiceIdentities; using Azure.Storage.Blobs; +using CommonUtilities; using CommonUtilities.AzureCloud; using k8s; using k8s.Models; -using Polly; -using Polly.Retry; +using static 
CommonUtilities.RetryHandler; namespace TesDeployer { @@ -28,13 +27,17 @@ namespace TesDeployer /// public class KubernetesManager { - private static readonly AsyncRetryPolicy WorkloadReadyRetryPolicy = Policy - .Handle() - .WaitAndRetryAsync(80, retryAttempt => TimeSpan.FromSeconds(15)); - - private static readonly AsyncRetryPolicy KubeExecRetryPolicy = Policy - .Handle(ex => ex.WebSocketErrorCode == WebSocketError.NotAWebSocket) - .WaitAndRetryAsync(200, retryAttempt => TimeSpan.FromSeconds(5)); + private static readonly AsyncRetryHandlerPolicy WorkloadReadyRetryPolicy = new RetryPolicyBuilder(Microsoft.Extensions.Options.Options.Create(new CommonUtilities.Options.RetryPolicyOptions())) + .PolicyBuilder.OpinionatedRetryPolicy() + .WithCustomizedRetryPolicyOptionsWait(80, (_, _) => TimeSpan.FromSeconds(15)) + .SetOnRetryBehavior() + .AsyncBuild(); + + private static readonly AsyncRetryHandlerPolicy KubeExecRetryPolicy = new RetryPolicyBuilder(Microsoft.Extensions.Options.Options.Create(new CommonUtilities.Options.RetryPolicyOptions())) + .PolicyBuilder.OpinionatedRetryPolicy(Polly.Policy.Handle(ex => ex.WebSocketErrorCode == WebSocketError.NotAWebSocket)) + .WithCustomizedRetryPolicyOptionsWait(200, (_, _) => TimeSpan.FromSeconds(5)) + .SetOnRetryBehavior() + .AsyncBuild(); private const string NginxIngressRepo = "https://kubernetes.github.io/ingress-nginx"; private const string NginxIngressVersion = "4.7.1"; @@ -363,7 +366,7 @@ async Task StreamHandler(Stream stream) } }, cancellationToken); - if (result.Outcome != OutcomeType.Successful && result.FinalException is not null) + if (result.Outcome != Polly.OutcomeType.Successful && result.FinalException is not null) { throw result.FinalException; } @@ -638,7 +641,7 @@ private static async Task WaitForWorkloadAsync(IKubernetes client, string } }, cancellationToken); - return result.Outcome == OutcomeType.Successful; + return result.Outcome == Polly.OutcomeType.Successful; } public class HelmValues From 
ce2ca2800287e7320c48ae121639b1bfa3062498 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 18 Oct 2024 16:36:53 -0700 Subject: [PATCH 158/202] formatting --- src/CommonUtilities/RetryPolicyBuilder.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CommonUtilities/RetryPolicyBuilder.cs b/src/CommonUtilities/RetryPolicyBuilder.cs index aeeaab447..df9f65dc1 100644 --- a/src/CommonUtilities/RetryPolicyBuilder.cs +++ b/src/CommonUtilities/RetryPolicyBuilder.cs @@ -576,7 +576,7 @@ private static Func, Context, TimeSpan> CombineSlee { List> stored; - if(attempt == 1) + if (attempt == 1) { context[RetryHandler.CombineSleepDurationsKey] = stored = combined; } From d48a5fee296d11b6803fdca958097c2a5376dbc7 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 18 Oct 2024 16:37:20 -0700 Subject: [PATCH 159/202] attempted fix --- src/TesApi.Web/TaskScheduler.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TesApi.Web/TaskScheduler.cs b/src/TesApi.Web/TaskScheduler.cs index d03101f7b..562c745e8 100644 --- a/src/TesApi.Web/TaskScheduler.cs +++ b/src/TesApi.Web/TaskScheduler.cs @@ -94,7 +94,7 @@ protected override async ValueTask ExecuteSetupAsync(CancellationToken cancellat foreach (var tesTask in (await Repository.GetItemsAsync( - predicate: t => t.IsActiveState(false), // TODO: preemptedIsTerminal + predicate: t => !TesTask.TerminalStates.Contains(t.State), cancellationToken: cancellationToken)) .OrderBy(t => t.CreationTime)) { From 32dd178b03045560e8d8ad5555f5df15ddafdf03 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 21 Oct 2024 11:59:43 -0700 Subject: [PATCH 160/202] Various fixes --- src/Tes.ApiClients/HttpApiClient.cs | 22 ++++++---- ...askPostgreSqlRepositoryIntegrationTests.cs | 2 +- src/TesApi.Web/Controllers/TaskServiceApi.cs | 11 ++--- src/TesApi.Web/TaskScheduler.cs | 42 ++++++++++++------- 4 files changed, 46 insertions(+), 31 deletions(-) diff --git a/src/Tes.ApiClients/HttpApiClient.cs 
b/src/Tes.ApiClients/HttpApiClient.cs index 5ee9af9bc..e1c8289ad 100644 --- a/src/Tes.ApiClients/HttpApiClient.cs +++ b/src/Tes.ApiClients/HttpApiClient.cs @@ -136,14 +136,15 @@ protected async Task HttpSendRequestWithRetryPolicyAsync( /// Response's content deserialization type. /// protected async Task HttpGetRequestAsync(Uri requestUrl, bool setAuthorizationHeader, - bool cacheResults, JsonTypeInfo typeInfo, CancellationToken cancellationToken) + bool cacheResults, JsonTypeInfo typeInfo, CancellationToken cancellationToken, + [System.Runtime.CompilerServices.CallerMemberName] string caller = default) { if (cacheResults) { - return await HttpGetRequestWithCachingAndRetryPolicyAsync(requestUrl, typeInfo, cancellationToken, setAuthorizationHeader); + return await HttpGetRequestWithCachingAndRetryPolicyAsync(requestUrl, typeInfo, cancellationToken, setAuthorizationHeader, caller); } - return await HttpGetRequestWithRetryPolicyAsync(requestUrl, typeInfo, cancellationToken, setAuthorizationHeader); + return await HttpGetRequestWithRetryPolicyAsync(requestUrl, typeInfo, cancellationToken, setAuthorizationHeader, caller); } /// @@ -156,7 +157,8 @@ protected async Task HttpGetRequestAsync(Uri requestUrl, b /// Response's content deserialization type. 
/// protected async Task HttpGetRequestWithCachingAndRetryPolicyAsync(Uri requestUrl, - JsonTypeInfo typeInfo, CancellationToken cancellationToken, bool setAuthorizationHeader = false) + JsonTypeInfo typeInfo, CancellationToken cancellationToken, bool setAuthorizationHeader = false, + [System.Runtime.CompilerServices.CallerMemberName] string caller = default) { var cacheKey = await ToCacheKeyAsync(requestUrl, setAuthorizationHeader, cancellationToken); @@ -168,7 +170,7 @@ protected async Task HttpGetRequestWithCachingAndRetryPolicyAsync GetApiResponseContentAsync(m, typeInfo, ct), - cancellationToken))!; + cancellationToken, caller))!; } /// @@ -207,7 +209,8 @@ protected async Task HttpGetRequestWithExpirableCachingAndRetryPolicy /// Response's content deserialization type. /// protected async Task HttpGetRequestWithRetryPolicyAsync(Uri requestUrl, - JsonTypeInfo typeInfo, CancellationToken cancellationToken, bool setAuthorizationHeader = false) + JsonTypeInfo typeInfo, CancellationToken cancellationToken, bool setAuthorizationHeader = false, + [System.Runtime.CompilerServices.CallerMemberName] string caller = default) { return await cachingRetryHandler.ExecuteWithRetryAndConversionAsync(async ct => { @@ -217,7 +220,7 @@ protected async Task HttpGetRequestWithRetryPolicyAsync(Ur return await HttpClient.SendAsync(httpRequest, ct); }, (m, ct) => GetApiResponseContentAsync(m, typeInfo, ct), - cancellationToken); + cancellationToken, caller); } /// @@ -271,7 +274,8 @@ private async Task CreateGetHttpRequest(Uri requestUrl, bool /// Response's content deserialization type. 
/// protected async Task HttpGetRequestWithRetryPolicyAsync( - Func httpRequestFactory, JsonTypeInfo typeInfo, CancellationToken cancellationToken, bool setAuthorizationHeader = false) + Func httpRequestFactory, JsonTypeInfo typeInfo, CancellationToken cancellationToken, bool setAuthorizationHeader = false, + [System.Runtime.CompilerServices.CallerMemberName] string caller = default) { return await cachingRetryHandler.ExecuteWithRetryAndConversionAsync(async ct => { @@ -285,7 +289,7 @@ protected async Task HttpGetRequestWithRetryPolicyAsync( return await HttpClient.SendAsync(request, ct); }, (m, ct) => GetApiResponseContentAsync(m, typeInfo, ct), - cancellationToken); + cancellationToken, caller); } private async Task AddAuthorizationHeaderToRequestAsync(HttpRequestMessage requestMessage, CancellationToken cancellationToken) diff --git a/src/TesApi.Tests/Repository/TesTaskPostgreSqlRepositoryIntegrationTests.cs b/src/TesApi.Tests/Repository/TesTaskPostgreSqlRepositoryIntegrationTests.cs index 6251be0c4..2d3d64721 100644 --- a/src/TesApi.Tests/Repository/TesTaskPostgreSqlRepositoryIntegrationTests.cs +++ b/src/TesApi.Tests/Repository/TesTaskPostgreSqlRepositoryIntegrationTests.cs @@ -139,7 +139,7 @@ public async Task ListTasksWithPagingRetrievesAllTesTasks() await Parallel.ForEachAsync(items, CancellationToken.None, async (item, token) => await repository.CreateItemAsync(item, token)); } - var controller = new TaskServiceApiController(repository, null, null, null, null); + var controller = new TaskServiceApiController(repository, null, null, null); string pageToken = null; var tesTaskIds = new HashSet(); diff --git a/src/TesApi.Web/Controllers/TaskServiceApi.cs b/src/TesApi.Web/Controllers/TaskServiceApi.cs index c4090cb6a..b283c2469 100644 --- a/src/TesApi.Web/Controllers/TaskServiceApi.cs +++ b/src/TesApi.Web/Controllers/TaskServiceApi.cs @@ -28,7 +28,6 @@ using Tes.TaskSubmitters; using TesApi.Attributes; using TesApi.Web; -using TesApi.Web.Storage; namespace 
TesApi.Controllers { @@ -36,19 +35,17 @@ namespace TesApi.Controllers /// API endpoints for s. /// /// - /// Contruct a + /// Construct a /// /// The main database repository - /// The storage access provider /// The task scheduler - /// The logger instance /// The GA4GH TES service information - public class TaskServiceApiController(IRepository repository, IStorageAccessProvider storageAccessProvider, ITaskScheduler taskScheduler, ILogger logger, TesServiceInfo serviceInfo) + /// The logger instance + public class TaskServiceApiController(IRepository repository, ITaskScheduler taskScheduler, TesServiceInfo serviceInfo, ILogger logger) : ControllerBase { //private const string rootExecutionPath = "/cromwell-executions"; private readonly IRepository repository = repository; - private readonly IStorageAccessProvider storageAccessProvider = storageAccessProvider; private readonly ITaskScheduler taskScheduler = taskScheduler; private readonly ILogger logger = logger; private readonly TesServiceInfo serviceInfo = serviceInfo; @@ -268,7 +265,7 @@ public virtual async Task CreateTaskAsync([FromBody] TesTask tesT } logger.LogDebug("Creating task with id {TesTask} state {TesTaskState}", tesTask.Id, tesTask.State); - await repository.CreateItemAsync(tesTask, cancellationToken); + tesTask = await repository.CreateItemAsync(tesTask, cancellationToken); await taskScheduler.ProcessQueuedTesTaskAsync(tesTask, cancellationToken); return StatusCode(200, new TesCreateTaskResponse { Id = tesTask.Id }); } diff --git a/src/TesApi.Web/TaskScheduler.cs b/src/TesApi.Web/TaskScheduler.cs index 562c745e8..20d8005f4 100644 --- a/src/TesApi.Web/TaskScheduler.cs +++ b/src/TesApi.Web/TaskScheduler.cs @@ -54,6 +54,7 @@ internal class TaskScheduler(RunnerEventsProcessor nodeEventProcessor, Microsoft , ITaskScheduler { private static readonly TimeSpan blobRunInterval = TimeSpan.FromSeconds(15); + private static readonly TimeSpan queuedRunInterval = TimeSpan.FromMilliseconds(100); internal static 
readonly TimeSpan BatchRunInterval = TimeSpan.FromSeconds(30); // The very fastest process inside of Azure Batch accessing anything within pools or jobs appears to use a 30 second polling interval private static readonly TimeSpan shortBackgroundRunInterval = TimeSpan.FromSeconds(1); private static readonly TimeSpan longBackgroundRunInterval = TimeSpan.FromSeconds(1); @@ -136,11 +137,6 @@ protected override async ValueTask ExecuteCoreAsync(CancellationToken cancellati stoppingToken = cancellationToken; List queuedTasks = []; - while (!cancellationToken.IsCancellationRequested && queuedTesTasks.TryDequeue(out var tesTask)) - { - queuedTasks.Add(((ITaskScheduler)this).ProcessQueuedTesTaskAsync(tesTask, cancellationToken)); - } - while (!cancellationToken.IsCancellationRequested && tesTaskBatchStates.TryDequeue(out var result)) { queuedTasks.Add(ProcessQueuedTesTaskStatesRequestAsync(result.TesTasks, result.TaskStates, result.Channel, cancellationToken)); @@ -153,6 +149,7 @@ protected override async ValueTask ExecuteCoreAsync(CancellationToken cancellati queuedTasks.Add(ExecuteShortBackgroundTasksAsync(cancellationToken)); queuedTasks.Add(ExecuteLongBackgroundTasksAsync(cancellationToken)); + queuedTasks.Add(ExecuteQueuedTesTasksOnBatchAsync(cancellationToken)); queuedTasks.Add(ExecuteCancelledTesTasksOnBatchAsync(cancellationToken)); queuedTasks.Add(ExecuteUpdateTesTaskFromEventBlobAsync(cancellationToken)); @@ -182,6 +179,29 @@ private async Task ProcessQueuedTesTaskStatesRequestAsync(TesTask[] tesTasks, Az } } + /// + /// Retrieves TesTasks queued via ProcessQueuedTesTaskAsync and schedules them for execution. + /// + /// Triggered when Microsoft.Extensions.Hosting.IHostedService.StopAsync(System.Threading.CancellationToken) is called. 
+ /// + private Task ExecuteQueuedTesTasksOnBatchAsync(CancellationToken cancellationToken) + { + return ExecuteActionOnIntervalAsync(queuedRunInterval, ProcessQueuedTesTasksAsync, cancellationToken); + } + + /// + /// Schedules queued TesTasks via !BatchScheduler.ProcessQueuedTesTaskAsync. + /// + /// Triggered when Microsoft.Extensions.Hosting.IHostedService.StopAsync(System.Threading.CancellationToken) is called. + /// + private async ValueTask ProcessQueuedTesTasksAsync(CancellationToken cancellationToken) + { + while (!cancellationToken.IsCancellationRequested && queuedTesTasks.TryDequeue(out var tesTask)) + { + await ProcessOrchestratedTesTaskAsync("Queued", new(BatchScheduler.ProcessQueuedTesTaskAsync(tesTask, cancellationToken), tesTask), cancellationToken); + } + } + /// /// Retrieves all event blobs from storage and updates the resultant state. /// @@ -367,16 +387,10 @@ await Parallel.ForEachAsync(eventStates.Select(@event => @event.MarkProcessedAsy } /// - async Task ITaskScheduler.ProcessQueuedTesTaskAsync(TesTask tesTask, CancellationToken cancellationToken) + Task ITaskScheduler.ProcessQueuedTesTaskAsync(TesTask tesTask, CancellationToken cancellationToken) { - if (IsRunning) - { - await ProcessOrchestratedTesTaskAsync("Queued", new(BatchScheduler.ProcessQueuedTesTaskAsync(tesTask, cancellationToken), tesTask), cancellationToken); - } - else - { - queuedTesTasks.Enqueue(tesTask); - } + queuedTesTasks.Enqueue(tesTask); + return Task.CompletedTask; } /// From 112a556386781bb2109aa82e6734b0975da2e523 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 21 Oct 2024 13:47:45 -0700 Subject: [PATCH 161/202] fix tests --- .../TaskServiceApiControllerTests.cs | 36 ++++++++++--------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/src/TesApi.Tests/TaskServiceApiControllerTests.cs b/src/TesApi.Tests/TaskServiceApiControllerTests.cs index c4f25c114..d9888dc17 100644 --- a/src/TesApi.Tests/TaskServiceApiControllerTests.cs +++ 
b/src/TesApi.Tests/TaskServiceApiControllerTests.cs @@ -12,7 +12,6 @@ using System.Threading.Tasks; using Microsoft.AspNetCore.Http; using Microsoft.AspNetCore.Mvc; -using Microsoft.Azure.Batch; using Microsoft.VisualStudio.TestTools.UnitTesting; using Moq; using Tes.Models; @@ -24,6 +23,11 @@ namespace TesApi.Tests [TestClass] public class TaskServiceApiControllerTests { + private void SetRepository(Mock> mock) + { + mock.Setup(x => x.CreateItemAsync(It.IsAny(), It.IsAny())).Returns((tesTask, _) => Task.FromResult(tesTask)); + } + [TestCategory("TES 1.1")] [TestMethod] public async Task TES_Supports_BackendParameter_vmsize() @@ -41,7 +45,7 @@ public async Task TES_Supports_BackendParameter_vmsize() Resources = new() { BackendParameters = backendParameters, BackendParametersStrict = true } }; - using var services = new TestServices.TestServiceProvider(); + using var services = new TestServices.TestServiceProvider(tesTaskRepository: SetRepository); var controller = services.GetT(); var result = await controller.CreateTaskAsync(tesTask, CancellationToken.None) as ObjectResult; @@ -71,7 +75,7 @@ public async Task TES_Supports_BackendParameter_workflow_execution_identity() Resources = new() { BackendParameters = backendParameters, BackendParametersStrict = true } }; - using var services = new TestServices.TestServiceProvider(); + using var services = new TestServices.TestServiceProvider(tesTaskRepository: SetRepository); var controller = services.GetT(); var result = await controller.CreateTaskAsync(tesTask, CancellationToken.None) as ObjectResult; @@ -101,7 +105,7 @@ public async Task CreateTaskAsync_ReturnsTesCreateTaskResponseWithBackendParamet Resources = new() { BackendParameters = backendParameters } }; - using var services = new TestServices.TestServiceProvider(); + using var services = new TestServices.TestServiceProvider(tesTaskRepository: SetRepository); var controller = services.GetT(); var result = await controller.CreateTaskAsync(tesTask, 
CancellationToken.None) as ObjectResult; @@ -133,7 +137,7 @@ public async Task CreateTaskAsync_ReturnsBadRequest_ForBackendParametersStrict_U Resources = new() { BackendParameters = backendParameters, BackendParametersStrict = true } }; - using var services = new TestServices.TestServiceProvider(); + using var services = new TestServices.TestServiceProvider(tesTaskRepository: SetRepository); var controller = services.GetT(); var result = await controller.CreateTaskAsync(tesTask, CancellationToken.None) as BadRequestObjectResult; @@ -165,7 +169,7 @@ public async Task CreateTaskAsync_ReturnsBadRequest_ForBackendParametersStrict_D Resources = new() { BackendParameters = backendParameters, BackendParametersStrict = true } }; - using var services = new TestServices.TestServiceProvider(); + using var services = new TestServices.TestServiceProvider(tesTaskRepository: SetRepository); var controller = services.GetT(); var result = await controller.CreateTaskAsync(tesTask, CancellationToken.None) as BadRequestObjectResult; @@ -179,7 +183,7 @@ public async Task CreateTaskAsync_ReturnsBadRequest_ForBackendParametersStrict_D public async Task CreateTaskAsync_ReturnsBadRequest_ForInvalidId() { var tesTask = new TesTask { Id = "ClientProvidedId", Executors = [new() { Image = "ubuntu", Command = ["cmd"] }] }; - using var services = new TestServices.TestServiceProvider(); + using var services = new TestServices.TestServiceProvider(tesTaskRepository: SetRepository); var controller = services.GetT(); var result = await controller.CreateTaskAsync(tesTask, CancellationToken.None) as BadRequestObjectResult; @@ -192,7 +196,7 @@ public async Task CreateTaskAsync_ReturnsBadRequest_ForInvalidId() public async Task CreateTaskAsync_ReturnsBadRequest_ForMissingDockerImage() { TesTask tesTask = new() { Executors = [new()] }; - using var services = new TestServices.TestServiceProvider(); + using var services = new TestServices.TestServiceProvider(tesTaskRepository: SetRepository); var controller 
= services.GetT(); var result = await controller.CreateTaskAsync(tesTask, CancellationToken.None) as ObjectResult; @@ -205,7 +209,7 @@ public async Task CreateTaskAsync_ReturnsBadRequest_ForMissingDockerImage() public async Task CreateTaskAsync_ReturnsBadRequest_ForRelativeInputPath() { TesTask tesTask = new() { Inputs = [new() { Path = "xyz/path" }] }; - using var services = new TestServices.TestServiceProvider(); + using var services = new TestServices.TestServiceProvider(tesTaskRepository: SetRepository); var controller = services.GetT(); var result = await controller.CreateTaskAsync(tesTask, CancellationToken.None) as ObjectResult; @@ -218,7 +222,7 @@ public async Task CreateTaskAsync_ReturnsBadRequest_ForRelativeInputPath() public async Task CreateTaskAsync_ReturnsBadRequest_ForInputMissingContentAndPath() { TesTask tesTask = new() { Inputs = [new() { Url = "http://host/path" }] }; - using var services = new TestServices.TestServiceProvider(); + using var services = new TestServices.TestServiceProvider(tesTaskRepository: SetRepository); var controller = services.GetT(); var result = await controller.CreateTaskAsync(tesTask, CancellationToken.None) as ObjectResult; @@ -231,7 +235,7 @@ public async Task CreateTaskAsync_ReturnsBadRequest_ForInputMissingContentAndPat public async Task CreateTaskAsync_ReturnsBadRequest_ForInputContentAndPath() { TesTask tesTask = new() { Inputs = [new() { Url = "http://host/path", Path = "/path/file", Content = "content" }] }; - using var services = new TestServices.TestServiceProvider(); + using var services = new TestServices.TestServiceProvider(tesTaskRepository: SetRepository); var controller = services.GetT(); var result = await controller.CreateTaskAsync(tesTask, CancellationToken.None) as ObjectResult; @@ -245,7 +249,7 @@ public async Task CreateTaskAsync_ReturnsTesCreateTaskResponse() { var tesTask = new TesTask() { Executors = [new() { Image = "ubuntu", Command = ["cmd"] }] }; - using var services = new 
TestServices.TestServiceProvider(); + using var services = new TestServices.TestServiceProvider(tesTaskRepository: SetRepository); var controller = services.GetT(); var result = await controller.CreateTaskAsync(tesTask, CancellationToken.None) as ObjectResult; @@ -692,7 +696,7 @@ public async Task CreateTaskAsync_InvalidInputsAndPathDoNotThrow() Executors = [new() { Image = "ubuntu", Command = ["cmd"] }] }; - using var services = new TestServices.TestServiceProvider(); + using var services = new TestServices.TestServiceProvider(tesTaskRepository: SetRepository); var controller = services.GetT(); await controller.CreateTaskAsync(tesTask1, CancellationToken.None); @@ -755,7 +759,7 @@ public async Task CreateTaskAsync_ExtractsCromwellWorkflowId() ] }; - using var services = new TestServices.TestServiceProvider(); + using var services = new TestServices.TestServiceProvider(tesTaskRepository: SetRepository); var controller = services.GetT(); await controller.CreateTaskAsync(tesTask, CancellationToken.None); @@ -784,7 +788,7 @@ public async Task CreateTaskAsync_CromwellWorkflowIdIsUsedAsTaskIdPrefix() ] }; - using var services = new TestServices.TestServiceProvider(); + using var services = new TestServices.TestServiceProvider(tesTaskRepository: SetRepository); var controller = services.GetT(); await controller.CreateTaskAsync(tesTask, CancellationToken.None); @@ -813,7 +817,7 @@ public async Task CreateTaskAsync_CromwellMetadataForTriggerServiceIsGenerated(s ] }; - using var services = new TestServices.TestServiceProvider(); + using var services = new TestServices.TestServiceProvider(tesTaskRepository: SetRepository); var controller = services.GetT(); await controller.CreateTaskAsync(tesTask, CancellationToken.None); From ad64b22bcdd07b58f4cdf94db359ed165755adcf Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 31 Oct 2024 10:45:04 -0700 Subject: [PATCH 162/202] fix repository concurrency detection --- src/Tes.Repository/Models/KeyedDbItem.cs | 13 +++++++++++++ 
src/Tes.Repository/Models/PostgreSqlOptions.cs | 2 +- src/Tes.Repository/Models/TesTaskPostgres.cs | 11 ++++------- src/Tes.Repository/PostgreSqlCachingRepository.cs | 8 ++++---- src/Tes.Repository/TesDbContext.cs | 3 +-- src/Tes.Repository/TesTaskPostgreSqlRepository.cs | 2 ++ .../Utilities/PostgresConnectionStringUtility.cs | 4 ++-- .../Utilities/PrependableFormattableString.cs | 2 +- .../TesTaskPostgreSqlRepositoryIntegrationTests.cs | 4 ++-- src/TesApi.Tests/TaskServiceApiControllerTests.cs | 2 +- src/TesApi.Web/Startup.cs | 1 + 11 files changed, 32 insertions(+), 20 deletions(-) create mode 100644 src/Tes.Repository/Models/KeyedDbItem.cs diff --git a/src/Tes.Repository/Models/KeyedDbItem.cs b/src/Tes.Repository/Models/KeyedDbItem.cs new file mode 100644 index 000000000..e377a395b --- /dev/null +++ b/src/Tes.Repository/Models/KeyedDbItem.cs @@ -0,0 +1,13 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System.ComponentModel.DataAnnotations.Schema; + +namespace Tes.Repository.Models +{ + public abstract class KeyedDbItem + { + [Column("id")] + public long Id { get; set; } + } +} diff --git a/src/Tes.Repository/Models/PostgreSqlOptions.cs b/src/Tes.Repository/Models/PostgreSqlOptions.cs index 65ef70e46..2d389a5b6 100644 --- a/src/Tes.Repository/Models/PostgreSqlOptions.cs +++ b/src/Tes.Repository/Models/PostgreSqlOptions.cs @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. 
-namespace Tes.Models +namespace Tes.Repository.Models { /// /// PostgresSql configuration options diff --git a/src/Tes.Repository/Models/TesTaskPostgres.cs b/src/Tes.Repository/Models/TesTaskPostgres.cs index cb7771f0b..3500c4834 100644 --- a/src/Tes.Repository/Models/TesTaskPostgres.cs +++ b/src/Tes.Repository/Models/TesTaskPostgres.cs @@ -4,19 +4,16 @@ using System.ComponentModel.DataAnnotations.Schema; using Microsoft.EntityFrameworkCore.Metadata.Internal; -namespace Tes.Models +namespace Tes.Repository.Models { /// /// Database schema for encapsulating a TesTask as Json for Postgresql. /// - [Table(Repository.TesDbContext.TesTasksPostgresTableName)] - public class TesTaskDatabaseItem + [Table(TesDbContext.TesTasksPostgresTableName)] + public class TesTaskDatabaseItem : KeyedDbItem { - [Column("id")] - public long Id { get; set; } - [Column("json", TypeName = "jsonb")] - public TesTask Json { get; set; } + public Tes.Models.TesTask Json { get; set; } public TesTaskDatabaseItem Clone() { diff --git a/src/Tes.Repository/PostgreSqlCachingRepository.cs b/src/Tes.Repository/PostgreSqlCachingRepository.cs index f83e740f2..ef688d1cc 100644 --- a/src/Tes.Repository/PostgreSqlCachingRepository.cs +++ b/src/Tes.Repository/PostgreSqlCachingRepository.cs @@ -21,7 +21,7 @@ namespace Tes.Repository /// /// Database table schema class /// Corresponding type for - public abstract class PostgreSqlCachingRepository : IDisposable where TDbItem : class where TItem : RepositoryItem + public abstract class PostgreSqlCachingRepository : IDisposable where TDbItem : Models.KeyedDbItem where TItem : RepositoryItem { private const int BatchSize = 1000; private static readonly TimeSpan defaultCompletedTaskCacheExpiration = TimeSpan.FromDays(1); @@ -34,7 +34,7 @@ public abstract class PostgreSqlCachingRepository : IDisposable private record struct WriteItem(TDbItem DbItem, WriteAction Action, TaskCompletionSource TaskSource); private readonly Channel itemsToWrite = 
Channel.CreateUnbounded(new() { SingleReader = true }); - private readonly ConcurrentDictionary updatingItems = new(); // Collection of all pending updates to be written, to faciliate detection of simultaneous parallel updates. + private readonly ConcurrentDictionary updatingItems = new(); // Collection of all pending updates to be written, to faciliate detection of simultaneous parallel updates. private readonly CancellationTokenSource writerWorkerCancellationTokenSource = new(); private readonly Task writerWorkerTask; @@ -153,7 +153,7 @@ protected Task AddUpdateOrRemoveItemInDbAsync(TDbItem item, Func(); var result = source.Task; - if (updatingItems.TryAdd(item, null)) + if (updatingItems.TryAdd(item.Id, null)) { result = source.Task.ContinueWith(RemoveUpdatingItem).Unwrap(); } @@ -173,7 +173,7 @@ protected Task AddUpdateOrRemoveItemInDbAsync(TDbItem item, Func RemoveUpdatingItem(Task task) { - _ = updatingItems.Remove(item, out _); + _ = updatingItems.Remove(item.Id, out _); return task.Status switch { TaskStatus.RanToCompletion => Task.FromResult(task.Result), diff --git a/src/Tes.Repository/TesDbContext.cs b/src/Tes.Repository/TesDbContext.cs index 1b92677c2..867fe8d73 100644 --- a/src/Tes.Repository/TesDbContext.cs +++ b/src/Tes.Repository/TesDbContext.cs @@ -5,7 +5,6 @@ using Microsoft.EntityFrameworkCore; using Npgsql; using Npgsql.EntityFrameworkCore.PostgreSQL.Infrastructure; -using Tes.Models; namespace Tes.Repository { @@ -29,7 +28,7 @@ public TesDbContext(NpgsqlDataSource dataSource, Action ContextOptionsBuilder { get; set; } - public DbSet TesTasks { get; set; } + public DbSet TesTasks { get; set; } protected override void OnConfiguring(DbContextOptionsBuilder optionsBuilder) { diff --git a/src/Tes.Repository/TesTaskPostgreSqlRepository.cs b/src/Tes.Repository/TesTaskPostgreSqlRepository.cs index b6ff045cb..c54c1cefe 100644 --- a/src/Tes.Repository/TesTaskPostgreSqlRepository.cs +++ b/src/Tes.Repository/TesTaskPostgreSqlRepository.cs @@ -18,6 +18,8 @@ 
namespace Tes.Repository using Microsoft.Extensions.Options; using Npgsql; using Tes.Models; + using Tes.Repository.Models; + using Tes.Repository.Utilities; using Tes.Utilities; /// diff --git a/src/Tes.Repository/Utilities/PostgresConnectionStringUtility.cs b/src/Tes.Repository/Utilities/PostgresConnectionStringUtility.cs index 76b3f28cd..b1f175573 100644 --- a/src/Tes.Repository/Utilities/PostgresConnectionStringUtility.cs +++ b/src/Tes.Repository/Utilities/PostgresConnectionStringUtility.cs @@ -4,9 +4,9 @@ using System; using System.Text; using Microsoft.Extensions.Options; -using Tes.Models; +using Tes.Repository.Models; -namespace Tes.Utilities +namespace Tes.Repository.Utilities { public static class ConnectionStringUtility { diff --git a/src/Tes.Repository/Utilities/PrependableFormattableString.cs b/src/Tes.Repository/Utilities/PrependableFormattableString.cs index f691bff99..6d6dc785c 100644 --- a/src/Tes.Repository/Utilities/PrependableFormattableString.cs +++ b/src/Tes.Repository/Utilities/PrependableFormattableString.cs @@ -3,7 +3,7 @@ using System; -namespace Tes.Utilities +namespace Tes.Repository.Utilities { internal class PrependableFormattableString : FormattableString { diff --git a/src/TesApi.Tests/Repository/TesTaskPostgreSqlRepositoryIntegrationTests.cs b/src/TesApi.Tests/Repository/TesTaskPostgreSqlRepositoryIntegrationTests.cs index 2d3d64721..d541d2bff 100644 --- a/src/TesApi.Tests/Repository/TesTaskPostgreSqlRepositoryIntegrationTests.cs +++ b/src/TesApi.Tests/Repository/TesTaskPostgreSqlRepositoryIntegrationTests.cs @@ -17,7 +17,7 @@ using Microsoft.Extensions.Options; using Microsoft.VisualStudio.TestTools.UnitTesting; using Tes.Models; -using Tes.Utilities; +using Tes.Repository.Utilities; using TesApi.Controllers; namespace Tes.Repository.Tests @@ -53,7 +53,7 @@ await PostgreSqlTestUtility.CreateTestDbAsync( subscriptionId, regionName, resourceGroupName, postgreSqlServerName, postgreSqlDatabaseName, adminLogin, adminPw); var 
connectionString = ConnectionStringUtility.GetPostgresConnectionString( - Options.Create(new PostgreSqlOptions + Options.Create(new Models.PostgreSqlOptions { ServerName = postgreSqlServerName, DatabaseName = postgreSqlDatabaseName, diff --git a/src/TesApi.Tests/TaskServiceApiControllerTests.cs b/src/TesApi.Tests/TaskServiceApiControllerTests.cs index d9888dc17..239aca283 100644 --- a/src/TesApi.Tests/TaskServiceApiControllerTests.cs +++ b/src/TesApi.Tests/TaskServiceApiControllerTests.cs @@ -593,7 +593,7 @@ private static TestServices.TestServiceProvider SetupT new(null, [])); r.Setup(repo => repo .JsonFormattableRawString(It.IsAny(), It.IsAny())) - .Returns((string property, FormattableString sql) => new Tes.Utilities.PrependableFormattableString($"{property}:", sql)); + .Returns((string property, FormattableString sql) => new Tes.Repository.Utilities.PrependableFormattableString($"{property}:", sql)); }); private static Task ListTasksWithTagArgumentsAsync(TaskServiceApiController controller, string query) diff --git a/src/TesApi.Web/Startup.cs b/src/TesApi.Web/Startup.cs index 40052e806..7075e2b40 100644 --- a/src/TesApi.Web/Startup.cs +++ b/src/TesApi.Web/Startup.cs @@ -25,6 +25,7 @@ using Tes.ApiClients; using Tes.Models; using Tes.Repository; +using Tes.Repository.Models; using TesApi.Filters; using TesApi.Web.Management; using TesApi.Web.Management.Batch; From 17a0cd1202ff6118fb45fe0ed9076ebee09c4e7f Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 31 Oct 2024 14:22:18 -0700 Subject: [PATCH 163/202] Handle RepositoryCollisionException --- .../PostgreSqlCachingRepository.cs | 4 +- .../RepositoryCollisionException.cs | 14 +-- src/Tes.SDK/TesClient.cs | 2 - .../TaskServiceApiControllerTests.cs | 2 +- src/TesApi.Web/Controllers/TaskServiceApi.cs | 2 +- .../OrchestrateOnBatchSchedulerServiceBase.cs | 34 +++--- src/TesApi.Web/PoolScheduler.cs | 45 ++++++-- src/TesApi.Web/RepositoryRetryHandler.cs | 4 +- src/TesApi.Web/TaskScheduler.cs | 105 
++++++++++++++---- 9 files changed, 154 insertions(+), 58 deletions(-) diff --git a/src/Tes.Repository/PostgreSqlCachingRepository.cs b/src/Tes.Repository/PostgreSqlCachingRepository.cs index ef688d1cc..6b36420b6 100644 --- a/src/Tes.Repository/PostgreSqlCachingRepository.cs +++ b/src/Tes.Repository/PostgreSqlCachingRepository.cs @@ -150,6 +150,8 @@ protected async Task> GetItemsAsync(DbSet dbSet, C /// protected Task AddUpdateOrRemoveItemInDbAsync(TDbItem item, Func getItem, WriteAction action, CancellationToken cancellationToken) { + ArgumentNullException.ThrowIfNull(getItem); + var source = new TaskCompletionSource(); var result = source.Task; @@ -161,7 +163,7 @@ protected Task AddUpdateOrRemoveItemInDbAsync(TDbItem item, Func( "Respository concurrency failure: attempt to update item with previously queued update pending.", - result.ContinueWith(task => getItem(task.Result), TaskContinuationOptions.OnlyOnRanToCompletion)); + getItem(item)); } if (!itemsToWrite.Writer.TryWrite(new(item, action, source))) diff --git a/src/Tes.Repository/RepositoryCollisionException.cs b/src/Tes.Repository/RepositoryCollisionException.cs index 4e2f367ab..3b6b8b9b6 100644 --- a/src/Tes.Repository/RepositoryCollisionException.cs +++ b/src/Tes.Repository/RepositoryCollisionException.cs @@ -7,21 +7,21 @@ namespace Tes.Repository { public class RepositoryCollisionException : Exception where T : RepositoryItem { - public System.Threading.Tasks.Task Task { get; } + public T RepositoryItem { get; } - public RepositoryCollisionException(System.Threading.Tasks.Task task) + public RepositoryCollisionException(T repositoryItem) { - Task = task; + RepositoryItem = repositoryItem; } - public RepositoryCollisionException(string message, System.Threading.Tasks.Task task) : base(message) + public RepositoryCollisionException(string message, T repositoryItem) : base(message) { - Task = task; + RepositoryItem = repositoryItem; } - public RepositoryCollisionException(string message, Exception 
innerException, System.Threading.Tasks.Task task) : base(message, innerException) + public RepositoryCollisionException(string message, Exception innerException, T repositoryItem) : base(message, innerException) { - Task = task; + RepositoryItem = repositoryItem; } } } diff --git a/src/Tes.SDK/TesClient.cs b/src/Tes.SDK/TesClient.cs index a3ce46e63..75c14906b 100644 --- a/src/Tes.SDK/TesClient.cs +++ b/src/Tes.SDK/TesClient.cs @@ -4,8 +4,6 @@ using System.Net.Http.Headers; using System.Runtime.CompilerServices; using System.Text; -using Azure.Identity; -using Azure.Storage.Blobs; using Newtonsoft.Json; using Polly; using Polly.Retry; diff --git a/src/TesApi.Tests/TaskServiceApiControllerTests.cs b/src/TesApi.Tests/TaskServiceApiControllerTests.cs index 239aca283..209b2d289 100644 --- a/src/TesApi.Tests/TaskServiceApiControllerTests.cs +++ b/src/TesApi.Tests/TaskServiceApiControllerTests.cs @@ -316,7 +316,7 @@ public async Task CancelTaskAsync_ReturnsConflict_ForRepositoryCollision() // Mock UpdateItemAsync to throw a RepositoryCollisionException r.Setup(repo => repo.UpdateItemAsync(It.IsAny(), It.IsAny())) - .ThrowsAsync(new RepositoryCollisionException(Task.FromResult(default))); + .ThrowsAsync(new RepositoryCollisionException(default)); }); var controller = services.GetT(); diff --git a/src/TesApi.Web/Controllers/TaskServiceApi.cs b/src/TesApi.Web/Controllers/TaskServiceApi.cs index b283c2469..da82a6c69 100644 --- a/src/TesApi.Web/Controllers/TaskServiceApi.cs +++ b/src/TesApi.Web/Controllers/TaskServiceApi.cs @@ -266,7 +266,7 @@ public virtual async Task CreateTaskAsync([FromBody] TesTask tesT logger.LogDebug("Creating task with id {TesTask} state {TesTaskState}", tesTask.Id, tesTask.State); tesTask = await repository.CreateItemAsync(tesTask, cancellationToken); - await taskScheduler.ProcessQueuedTesTaskAsync(tesTask, cancellationToken); + taskScheduler.QueueTesTask(tesTask); return StatusCode(200, new TesCreateTaskResponse { Id = tesTask.Id }); } diff --git 
a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs index 71cf7e222..01e4f373c 100644 --- a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs +++ b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs @@ -134,10 +134,13 @@ protected async Task ExecuteActionOnIntervalAsync(TimeSpan runInterval, Func /// Tag to disambiguate the state and/or action workflow performed in log messages. /// . - /// A for controlling the lifetime of the asynchronous operation. + /// /// - protected async ValueTask ProcessOrchestratedTesTaskAsync(string pollName, RelatedTask task, CancellationToken cancellationToken) + /// A for controlling the lifetime of the asynchronous operation. + protected async ValueTask ProcessOrchestratedTesTaskAsync(string pollName, RelatedTask task, Func, ValueTask> requeue, CancellationToken cancellationToken) { + ArgumentNullException.ThrowIfNull(requeue); + var tesTask = task.Related; try @@ -251,27 +254,19 @@ protected async ValueTask ProcessOrchestratedTesTaskAsync(string pollName, Relat } catch (RepositoryCollisionException rce) { - Logger.LogError(rce, "RepositoryCollisionException in OrchestrateTesTasksOnBatch({Poll})", pollName); + Logger.LogInformation(rce, "RepositoryCollisionException in OrchestrateTesTasksOnBatch({Poll})", pollName); try { - var currentTesTask = await rce.Task; + var currentTesTask = rce.RepositoryItem; - if (currentTesTask is not null && currentTesTask.IsActiveState()) + if (currentTesTask is not null) { - currentTesTask.SetWarning(rce.Message); - - if (currentTesTask.IsActiveState()) - { - // TODO: merge tesTask and currentTesTask - } - - await Repository.UpdateItemAsync(currentTesTask, cancellationToken); + await requeue(rce); } } catch (Exception exc) { - // Consider retrying repository.UpdateItemAsync() if this exception was thrown from 'await rce.Task' Logger.LogError(exc, "Updating TES Task '{TesTask}' threw {ExceptionType}: '{ExceptionMessage}'. 
Stack trace: {ExceptionStackTrace}", tesTask.Id, exc.GetType().FullName, exc.Message, exc.StackTrace); } } @@ -302,18 +297,21 @@ static bool IsExceptionHttpConflictWhereTaskIsComplete(Exception exc) /// Tag to disambiguate the state and/or action workflow performed in log messages. /// Provides array of s on which to perform actions through . /// Method operating on returning indicating if each needs updating into the repository. + /// /// A for controlling the lifetime of the asynchronous operation. /// Tag to indicate the underlying unit quantity of items processed in log messages. - /// True to process even if there are no tasks processed. /// A that represents this method's operations. + /// True to process even if there are no tasks processed. protected async ValueTask OrchestrateTesTasksOnBatchAsync( string pollName, Func>> tesTaskGetter, Func>> tesTaskProcessor, + Func, ValueTask> requeue, CancellationToken cancellationToken, - string unitsLabel = "tasks", - bool needPoolFlush = false) + string unitsLabel = "tasks", bool needPoolFlush = false) { + ArgumentNullException.ThrowIfNull(requeue); + var tesTasks = await (await tesTaskGetter(cancellationToken)).ToArrayAsync(cancellationToken); var noTasks = tesTasks.All(task => task is null); @@ -327,7 +325,7 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync( if (!noTasks) { - await Parallel.ForEachAsync(tesTaskProcessor(tesTasks, cancellationToken), cancellationToken, (task, token) => ProcessOrchestratedTesTaskAsync(pollName, task, token)); + await Parallel.ForEachAsync(tesTaskProcessor(tesTasks, cancellationToken), cancellationToken, (task, token) => ProcessOrchestratedTesTaskAsync(pollName, task, requeue, token)); } if (BatchScheduler.NeedPoolFlush) diff --git a/src/TesApi.Web/PoolScheduler.cs b/src/TesApi.Web/PoolScheduler.cs index 5efc0842d..82ea0cb81 100644 --- a/src/TesApi.Web/PoolScheduler.cs +++ b/src/TesApi.Web/PoolScheduler.cs @@ -7,6 +7,7 @@ using System.Linq; using System.Threading; using 
System.Threading.Tasks; +using CommonUtilities; using Microsoft.Azure.Batch; using Microsoft.Azure.Batch.Common; using Microsoft.Extensions.Logging; @@ -159,14 +160,14 @@ private async ValueTask ProcessTasksAsync(IBatchPool pool, DateTime now, IEnumer /// private async ValueTask ProcessCloudTaskStatesAsync(string poolId, IAsyncEnumerable states, CancellationToken cancellationToken) { - var list = new ConcurrentBag<(TesTask TesTask, AzureBatchTaskState State)>(); + ConcurrentBag<(TesTask TesTask, AzureBatchTaskState State)> tasksAndStates = []; await Parallel.ForEachAsync(states, cancellationToken, async (state, token) => { TesTask tesTask = default; if (await Repository.TryGetItemAsync(BatchScheduler.GetTesTaskIdFromCloudTaskId(state.CloudTaskId), token, task => tesTask = task) && tesTask is not null) { - list.Add((tesTask, state.TaskState)); + tasksAndStates.Add((tesTask, state.TaskState)); } else { @@ -174,13 +175,41 @@ await Parallel.ForEachAsync(states, cancellationToken, async (state, token) => } }); - if (!list.IsEmpty) + if (!tasksAndStates.IsEmpty) { - await OrchestrateTesTasksOnBatchAsync( - $"NodeState ({poolId})", - _ => ValueTask.FromResult(list.Select(t => t.TesTask).ToAsyncEnumerable()), - (tesTasks, token) => TaskScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, list.Select(t => t.State).ToArray(), token), - cancellationToken); + ConcurrentBag requeues = []; + Dictionary statesByTask = new(StringComparer.Ordinal); + List tasks = []; + + tasksAndStates.ForEach(t => + { + tasks.Add(t.TesTask); + statesByTask.Add(t.TesTask.Id, t.State); + }); + + do + { + requeues.Clear(); + + await OrchestrateTesTasksOnBatchAsync( + $"NodeState ({poolId})", + _ => ValueTask.FromResult(tasks.ToAsyncEnumerable()), + (tesTasks, token) => TaskScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, tesTasks.Select(task => statesByTask[task.Id]).ToArray(), token), + ex => { requeues.Add(ex.RepositoryItem.Id); return ValueTask.CompletedTask; }, cancellationToken); + + 
tasks.Clear(); + + await Parallel.ForEachAsync(requeues, cancellationToken, async (id, token) => + { + TesTask tesTask = default; + + if (await Repository.TryGetItemAsync(id, token, task => tesTask = task)) + { + tasks.Add(tesTask); + } + }); + } + while (!requeues.IsEmpty); } else { diff --git a/src/TesApi.Web/RepositoryRetryHandler.cs b/src/TesApi.Web/RepositoryRetryHandler.cs index c9dc9303b..c4963f9e2 100644 --- a/src/TesApi.Web/RepositoryRetryHandler.cs +++ b/src/TesApi.Web/RepositoryRetryHandler.cs @@ -34,7 +34,9 @@ public RepositoryRetryHandler(IRepository repository, IOptions(ex => ex is not RepositoryCollisionException)) + .WithRetryPolicyOptionsWait() .SetOnRetryBehavior(logger) .AsyncBuild(); _repository = repository; diff --git a/src/TesApi.Web/TaskScheduler.cs b/src/TesApi.Web/TaskScheduler.cs index 20d8005f4..15a96a2d6 100644 --- a/src/TesApi.Web/TaskScheduler.cs +++ b/src/TesApi.Web/TaskScheduler.cs @@ -8,6 +8,7 @@ using System.Threading; using System.Threading.Channels; using System.Threading.Tasks; +using CommonUtilities; using Microsoft.Extensions.Logging; using Tes.Models; using Tes.Repository; @@ -26,8 +27,7 @@ public interface ITaskScheduler /// Schedules a /// /// A to schedule on the batch system. - /// A for controlling the lifetime of the asynchronous operation. 
- Task ProcessQueuedTesTaskAsync(TesTask tesTask, CancellationToken cancellationToken); + void QueueTesTask(TesTask tesTask); /// /// Updates s with task-related state @@ -124,7 +124,7 @@ protected override async ValueTask ExecuteSetupAsync(CancellationToken cancellat } catch (Exception ex) { - await ProcessOrchestratedTesTaskAsync("Initialization", new(Task.FromException(ex), tesTask), cancellationToken); + await ProcessOrchestratedTesTaskAsync("Initialization", new(Task.FromException(ex), tesTask), ex => { Logger.LogCritical(ex, "Unexpected repository failure in initialization with {TesTask}", ex.RepositoryItem.Id); return ValueTask.CompletedTask; }, cancellationToken); } } @@ -198,7 +198,17 @@ private async ValueTask ProcessQueuedTesTasksAsync(CancellationToken cancellatio { while (!cancellationToken.IsCancellationRequested && queuedTesTasks.TryDequeue(out var tesTask)) { - await ProcessOrchestratedTesTaskAsync("Queued", new(BatchScheduler.ProcessQueuedTesTaskAsync(tesTask, cancellationToken), tesTask), cancellationToken); + await ProcessOrchestratedTesTaskAsync("Queued", new(BatchScheduler.ProcessQueuedTesTaskAsync(tesTask, cancellationToken), tesTask), Requeue, cancellationToken); + } + + async ValueTask Requeue(RepositoryCollisionException exception) + { + TesTask tesTask = default; + + if (await Repository.TryGetItemAsync(exception.RepositoryItem.Id, cancellationToken, task => tesTask = task) && (tesTask?.IsActiveState() ?? 
false) && tesTask?.State != TesState.CANCELING) + { + queuedTesTasks.Enqueue(tesTask); + } } } @@ -239,14 +249,43 @@ private Task ExecuteCancelledTesTasksOnBatchAsync(CancellationToken cancellation .ToAsyncEnumerable()); return ExecuteActionOnIntervalAsync(BatchRunInterval, - token => OrchestrateTesTasksOnBatchAsync( - "Cancelled", - query, - (tasks, ct) => ((ITaskScheduler)this).ProcessTesTaskBatchStatesAsync( - tasks, - Enumerable.Repeat(new(AzureBatchTaskState.TaskState.CancellationRequested), tasks.Length).ToArray(), - ct), - token), + async token => + { + ConcurrentBag requeues = []; + List tasks = []; + + await foreach (var task in await query(cancellationToken)) + { + tasks.Add(task); + } + + do + { + requeues.Clear(); + + await OrchestrateTesTasksOnBatchAsync( + "Cancelled", + query, + (tasks, ct) => ((ITaskScheduler)this).ProcessTesTaskBatchStatesAsync( + tasks, + Enumerable.Repeat(new(AzureBatchTaskState.TaskState.CancellationRequested), tasks.Length).ToArray(), + ct), + ex => { requeues.Add(ex.RepositoryItem.Id); return ValueTask.CompletedTask; }, token); + + tasks.Clear(); + + await Parallel.ForEachAsync(requeues, cancellationToken, async (id, token) => + { + TesTask tesTask = default; + + if (await Repository.TryGetItemAsync(id, token, task => tesTask = task)) + { + tasks.Add(tesTask); + } + }); + } + while (!requeues.IsEmpty); + }, cancellationToken); } @@ -361,14 +400,43 @@ private async ValueTask UpdateTesTasksFromAvailableEventsAsync(IEnumerable<(TesT return; } - // Update TesTasks - await OrchestrateTesTasksOnBatchAsync( + ConcurrentBag requeues = []; + Dictionary MarkProcessedAsync)> statesByTask = new(StringComparer.Ordinal); + List tasks = []; + + eventStates.ForEach(t => + { + tasks.Add(t.Task); + statesByTask.Add(t.Task.Id, (t.State, t.MarkProcessedAsync)); + }); + + do + { + requeues.Clear(); + + + // Update TesTasks + await OrchestrateTesTasksOnBatchAsync( "NodeEvent", - _ => ValueTask.FromResult(eventStates.Select(@event => 
@event.Task).ToAsyncEnumerable()), - (tesTasks, token) => ((ITaskScheduler)this).ProcessTesTaskBatchStatesAsync(tesTasks, eventStates.Select(@event => @event.State).ToArray(), token), - cancellationToken, + _ => ValueTask.FromResult(tasks.ToAsyncEnumerable()), + (tesTasks, token) => ((ITaskScheduler)this).ProcessTesTaskBatchStatesAsync(tesTasks, tesTasks.Select(task => statesByTask[task.Id].State).ToArray(), token), + ex => { requeues.Add(ex.RepositoryItem.Id); return ValueTask.CompletedTask; }, cancellationToken, "events"); + tasks.Clear(); + + await Parallel.ForEachAsync(requeues, cancellationToken, async (id, token) => + { + TesTask tesTask = default; + + if (await Repository.TryGetItemAsync(id, token, task => tesTask = task)) + { + tasks.Add(tesTask); + } + }); + } + while (!requeues.IsEmpty); + await Parallel.ForEachAsync(eventStates.Select(@event => @event.MarkProcessedAsync).Where(func => func is not null), cancellationToken, async (markEventProcessed, token) => { try @@ -387,10 +455,9 @@ await Parallel.ForEachAsync(eventStates.Select(@event => @event.MarkProcessedAsy } /// - Task ITaskScheduler.ProcessQueuedTesTaskAsync(TesTask tesTask, CancellationToken cancellationToken) + void ITaskScheduler.QueueTesTask(TesTask tesTask) { queuedTesTasks.Enqueue(tesTask); - return Task.CompletedTask; } /// From a6d942043e29e0ad3441e50a6cdf63d3c6bb2d41 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 31 Oct 2024 17:15:29 -0700 Subject: [PATCH 164/202] Fix task event processing loop --- src/TesApi.Web/PoolScheduler.cs | 11 +++--- src/TesApi.Web/TaskScheduler.cs | 59 +++++++++++++++++++++++---------- 2 files changed, 48 insertions(+), 22 deletions(-) diff --git a/src/TesApi.Web/PoolScheduler.cs b/src/TesApi.Web/PoolScheduler.cs index 82ea0cb81..0ebc7b155 100644 --- a/src/TesApi.Web/PoolScheduler.cs +++ b/src/TesApi.Web/PoolScheduler.cs @@ -190,24 +190,27 @@ await Parallel.ForEachAsync(states, cancellationToken, async (state, token) => do { requeues.Clear(); - await 
OrchestrateTesTasksOnBatchAsync( $"NodeState ({poolId})", _ => ValueTask.FromResult(tasks.ToAsyncEnumerable()), (tesTasks, token) => TaskScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, tesTasks.Select(task => statesByTask[task.Id]).ToArray(), token), ex => { requeues.Add(ex.RepositoryItem.Id); return ValueTask.CompletedTask; }, cancellationToken); - tasks.Clear(); - + // Fetch updated TesTasks from the repository + ConcurrentBag requeuedTasks = []; await Parallel.ForEachAsync(requeues, cancellationToken, async (id, token) => { TesTask tesTask = default; if (await Repository.TryGetItemAsync(id, token, task => tesTask = task)) { - tasks.Add(tesTask); + requeuedTasks.Add(tesTask); } }); + + // Stage next loop + tasks.Clear(); + requeuedTasks.ForEach(tasks.Add); } while (!requeues.IsEmpty); } diff --git a/src/TesApi.Web/TaskScheduler.cs b/src/TesApi.Web/TaskScheduler.cs index 15a96a2d6..d2bc8658f 100644 --- a/src/TesApi.Web/TaskScheduler.cs +++ b/src/TesApi.Web/TaskScheduler.cs @@ -4,6 +4,7 @@ using System; using System.Collections.Concurrent; using System.Collections.Generic; +using System.Collections.Immutable; using System.Linq; using System.Threading; using System.Threading.Channels; @@ -262,7 +263,6 @@ private Task ExecuteCancelledTesTasksOnBatchAsync(CancellationToken cancellation do { requeues.Clear(); - await OrchestrateTesTasksOnBatchAsync( "Cancelled", query, @@ -272,17 +272,21 @@ await OrchestrateTesTasksOnBatchAsync( ct), ex => { requeues.Add(ex.RepositoryItem.Id); return ValueTask.CompletedTask; }, token); - tasks.Clear(); - + // Fetch updated TesTasks from the repository + ConcurrentBag requeuedTasks = []; await Parallel.ForEachAsync(requeues, cancellationToken, async (id, token) => { TesTask tesTask = default; if (await Repository.TryGetItemAsync(id, token, task => tesTask = task)) { - tasks.Add(tesTask); + requeuedTasks.Add(tesTask); } }); + + // Stage next loop + tasks.Clear(); + requeuedTasks.ForEach(tasks.Add); } while (!requeues.IsEmpty); }, @@ 
-401,39 +405,58 @@ private async ValueTask UpdateTesTasksFromAvailableEventsAsync(IEnumerable<(TesT } ConcurrentBag requeues = []; - Dictionary MarkProcessedAsync)> statesByTask = new(StringComparer.Ordinal); - List tasks = []; + ConcurrentDictionary MarkProcessedAsync)>> statesByTask = new(StringComparer.Ordinal); + HashSet tasks = []; eventStates.ForEach(t => { - tasks.Add(t.Task); - statesByTask.Add(t.Task.Id, (t.State, t.MarkProcessedAsync)); + _ = tasks.Add(t.Task); + _ = statesByTask.AddOrUpdate(t.Task.Id, _ => [(t.State, t.MarkProcessedAsync)], (_, array) => array.Add((t.State, t.MarkProcessedAsync))); }); do { + // Update TesTasks one event each per loop requeues.Clear(); - - - // Update TesTasks await OrchestrateTesTasksOnBatchAsync( - "NodeEvent", - _ => ValueTask.FromResult(tasks.ToAsyncEnumerable()), - (tesTasks, token) => ((ITaskScheduler)this).ProcessTesTaskBatchStatesAsync(tesTasks, tesTasks.Select(task => statesByTask[task.Id].State).ToArray(), token), - ex => { requeues.Add(ex.RepositoryItem.Id); return ValueTask.CompletedTask; }, cancellationToken, - "events"); + "NodeEvent", + _ => ValueTask.FromResult(tasks.ToAsyncEnumerable()), + (tesTasks, token) => ((ITaskScheduler)this).ProcessTesTaskBatchStatesAsync(tesTasks, tesTasks.Select(task => statesByTask[task.Id][0].State).ToArray(), token), + ex => { requeues.Add(ex.RepositoryItem.Id); return ValueTask.CompletedTask; }, + cancellationToken, + "events"); + + // Get next state for each task (if any) for next loop + _ = Parallel.ForEach(tasks, task => + { + // Don't remove current state if there was a repository conflict + if (!requeues.Contains(task.Id)) + { + var states = statesByTask[task.Id].RemoveAt(0); - tasks.Clear(); + if (!states.IsEmpty) + { + statesByTask[task.Id] = states; + requeues.Add(task.Id); + } + } + }); + // Fetch updated TesTasks from the repository + ConcurrentBag requeuedTasks = []; await Parallel.ForEachAsync(requeues, cancellationToken, async (id, token) => { TesTask tesTask = 
default; if (await Repository.TryGetItemAsync(id, token, task => tesTask = task)) { - tasks.Add(tesTask); + requeuedTasks.Add(tesTask); } }); + + // Stage next loop + tasks.Clear(); + requeuedTasks.ForEach(task => _ = tasks.Add(task)); } while (!requeues.IsEmpty); From d19d95edec4d9acddc0862a7f35f325922d513c8 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 1 Nov 2024 12:04:46 -0700 Subject: [PATCH 165/202] Logging formatting and level updates --- src/Tes.ApiClients/DrsHubApiClient.cs | 4 ++-- src/Tes.ApiClients/TerraSamApiClient.cs | 2 +- .../PostgreSqlCachingRepository.cs | 4 ++-- .../Authentication/CredentialsManager.cs | 4 ++-- .../ContainerRegistryAuthorizationManager.cs | 4 ++-- .../Docker/VolumeBindingsGenerator.cs | 4 ++-- src/Tes.Runner/Events/EventSink.cs | 8 +++---- src/Tes.Runner/Events/EventsPublisher.cs | 2 +- src/Tes.Runner/Executor.cs | 14 ++++++------ .../Storage/FileOperationResolver.cs | 4 ++-- .../Storage/TerraUrlTransformationStrategy.cs | 16 +++++++------- src/Tes.Runner/Transfer/BlobDownloader.cs | 2 +- .../Transfer/BlobOperationPipeline.cs | 10 ++++----- .../Transfer/DefaultFileInfoProvider.cs | 14 ++++++------ .../Transfer/Md5HashListProvider.cs | 2 +- src/Tes.Runner/Transfer/PartsProcessor.cs | 6 ++--- src/Tes.Runner/Transfer/PartsProducer.cs | 2 +- src/Tes.Runner/Transfer/PartsReader.cs | 2 +- src/Tes.Runner/Transfer/PartsWriter.cs | 2 +- .../Transfer/ProcessedPartsProcessor.cs | 4 ++-- src/Tes.RunnerCLI/Commands/CommandHandlers.cs | 6 ++--- src/Tes.RunnerCLI/Commands/ProcessLauncher.cs | 4 ++-- src/TesApi.Web/AllowedVmSizesService.cs | 2 +- src/TesApi.Web/AzureProxy.cs | 10 ++++----- src/TesApi.Web/BatchPool.cs | 8 +++---- src/TesApi.Web/BatchPools.BatchScheduler.cs | 2 +- src/TesApi.Web/BatchScheduler.cs | 20 ++++++++--------- src/TesApi.Web/Controllers/TaskServiceApi.cs | 4 ++-- .../OperationCancelledExceptionFilter.cs | 2 +- .../Management/ArmBatchQuotaProvider.cs | 2 +- .../Management/Batch/ArmBatchPoolManager.cs | 8 +++---- 
.../Management/Batch/PoolMetadataReader.cs | 2 +- .../Management/Batch/TerraBatchPoolManager.cs | 4 ++-- .../PriceApiBatchSkuInformationProvider.cs | 18 +++++++-------- src/TesApi.Web/PoolScheduler.cs | 12 +++++----- src/TesApi.Web/Program.cs | 8 +++++++ .../Runner/TaskExecutionScriptingManager.cs | 6 ++--- .../Runner/TaskToNodeTaskConverter.cs | 22 +++++++++---------- .../Storage/TerraStorageAccessProvider.cs | 14 ++++++------ src/TesApi.Web/TaskScheduler.cs | 12 +++++----- src/TesApi.Web/TerraActionIdentityProvider.cs | 4 ++-- src/TesApi.Web/appsettings.json | 10 ++++----- 42 files changed, 149 insertions(+), 141 deletions(-) diff --git a/src/Tes.ApiClients/DrsHubApiClient.cs b/src/Tes.ApiClients/DrsHubApiClient.cs index cba923385..9a66c22e2 100644 --- a/src/Tes.ApiClients/DrsHubApiClient.cs +++ b/src/Tes.ApiClients/DrsHubApiClient.cs @@ -57,7 +57,7 @@ public virtual async Task ResolveDrsUriAsync(Uri drsUri, { var apiUrl = GetResolveDrsApiUrl(); - Logger.LogInformation(@"Resolving DRS URI calling: {uri}", apiUrl); + Logger.LogDebug(@"Resolving DRS URI calling: {uri}", apiUrl); response = await HttpSendRequestWithRetryPolicyAsync(() => new HttpRequestMessage(HttpMethod.Post, apiUrl) { Content = GetDrsResolveRequestContent(drsUri) }, @@ -65,7 +65,7 @@ public virtual async Task ResolveDrsUriAsync(Uri drsUri, var apiResponse = await GetDrsResolveApiResponseAsync(response, cancellationToken); - Logger.LogInformation(@"Successfully resolved URI: {drsUri}", drsUri); + Logger.LogDebug(@"Successfully resolved URI: {drsUri}", drsUri); return apiResponse; } diff --git a/src/Tes.ApiClients/TerraSamApiClient.cs b/src/Tes.ApiClients/TerraSamApiClient.cs index 254ba5122..38c26fdc3 100644 --- a/src/Tes.ApiClients/TerraSamApiClient.cs +++ b/src/Tes.ApiClients/TerraSamApiClient.cs @@ -52,7 +52,7 @@ private async Task GetActionManagedIdentity var url = GetSamActionManagedIdentityUrl(resourceType, resourceId, action); - Logger.LogInformation(@"Fetching action managed identity from 
Sam for {resourceId}", resourceId); + Logger.LogDebug(@"Fetching action managed identity from Sam for {resourceId}", resourceId); try { diff --git a/src/Tes.Repository/PostgreSqlCachingRepository.cs b/src/Tes.Repository/PostgreSqlCachingRepository.cs index 6b36420b6..42fdc7ac4 100644 --- a/src/Tes.Repository/PostgreSqlCachingRepository.cs +++ b/src/Tes.Repository/PostgreSqlCachingRepository.cs @@ -62,7 +62,7 @@ protected PostgreSqlCachingRepository(Microsoft.Extensions.Hosting.IHostApplicat writerWorkerTask = Task.Run(() => WriterWorkerAsync(writerWorkerCancellationTokenSource.Token)) .ContinueWith(async task => { - Logger?.LogInformation("The repository WriterWorkerAsync ended with TaskStatus: {TaskStatus}", task.Status); + Logger?.LogDebug("The repository WriterWorkerAsync ended with TaskStatus: {TaskStatus}", task.Status); if (task.Status == TaskStatus.Faulted) { @@ -77,7 +77,7 @@ protected PostgreSqlCachingRepository(Microsoft.Extensions.Hosting.IHostApplicat await Task.Delay(TimeSpan.FromSeconds(40)); // Give the logger time to flush; default flush is 30s hostApplicationLifetime?.StopApplication(); }, TaskContinuationOptions.NotOnCanceled) - .ContinueWith(task => Logger?.LogInformation("The repository WriterWorkerAsync ended normally"), TaskContinuationOptions.OnlyOnCanceled); + .ContinueWith(task => Logger?.LogDebug("The repository WriterWorkerAsync ended normally"), TaskContinuationOptions.OnlyOnCanceled); } /// diff --git a/src/Tes.Runner/Authentication/CredentialsManager.cs b/src/Tes.Runner/Authentication/CredentialsManager.cs index b4c87dcb8..ce2cf574b 100644 --- a/src/Tes.Runner/Authentication/CredentialsManager.cs +++ b/src/Tes.Runner/Authentication/CredentialsManager.cs @@ -65,7 +65,7 @@ private TokenCredential GetTokenCredentialImpl(string? 
managedIdentityResourceId if (!string.IsNullOrWhiteSpace(managedIdentityResourceId)) { - logger.LogInformation("Token credentials with Managed Identity and resource ID: {NodeManagedIdentityResourceId}", managedIdentityResourceId); + logger.LogDebug("Token credentials with Managed Identity and resource ID: {NodeManagedIdentityResourceId}", managedIdentityResourceId); var tokenCredentialOptions = new TokenCredentialOptions { AuthorityHost = authorityHost }; tokenCredential = new ManagedIdentityCredential( @@ -74,7 +74,7 @@ private TokenCredential GetTokenCredentialImpl(string? managedIdentityResourceId } else { - logger.LogInformation("Token credentials with DefaultAzureCredential"); + logger.LogDebug("Token credentials with DefaultAzureCredential"); var defaultAzureCredentialOptions = new DefaultAzureCredentialOptions { AuthorityHost = authorityHost }; tokenCredential = new DefaultAzureCredential(defaultAzureCredentialOptions); } diff --git a/src/Tes.Runner/Docker/ContainerRegistryAuthorizationManager.cs b/src/Tes.Runner/Docker/ContainerRegistryAuthorizationManager.cs index be21c877a..62d150a08 100644 --- a/src/Tes.Runner/Docker/ContainerRegistryAuthorizationManager.cs +++ b/src/Tes.Runner/Docker/ContainerRegistryAuthorizationManager.cs @@ -48,11 +48,11 @@ public ContainerRegistryAuthorizationManager(CredentialsManager tokenCredentials if (string.IsNullOrWhiteSpace(acrAccessToken)) { - logger.LogInformation(@"The ACR instance is public. No authorization is required. Registry: {RegistryEndpoint}", registryAddress); + logger.LogDebug(@"The ACR instance is public. No authorization is required. Registry: {RegistryEndpoint}", registryAddress); return null; // image is available anonymously } - logger.LogInformation(@"The ACR instance is private. An access token was successfully obtained. Registry: {RegistryEndpoint}", registryAddress); + logger.LogDebug(@"The ACR instance is private. An access token was successfully obtained. 
Registry: {RegistryEndpoint}", registryAddress); return new AuthConfig { diff --git a/src/Tes.Runner/Docker/VolumeBindingsGenerator.cs b/src/Tes.Runner/Docker/VolumeBindingsGenerator.cs index 708ce16b0..4cee5639c 100644 --- a/src/Tes.Runner/Docker/VolumeBindingsGenerator.cs +++ b/src/Tes.Runner/Docker/VolumeBindingsGenerator.cs @@ -69,7 +69,7 @@ private void AddVolumeBindingIfRequired(HashSet volumeBindings, string p if (!expandedPath.StartsWith(mountParentDirectory)) { - logger.LogDebug( + logger.LogTrace( @"The expanded path value {ExpandedPath} does not contain the specified mount parent directory: {MountParentDirectory}. No volume binding will be created for this file in the container.", expandedPath, mountParentDirectory); return default; @@ -79,7 +79,7 @@ private void AddVolumeBindingIfRequired(HashSet volumeBindings, string p var volBinding = $"{mountParentDirectory.TrimEnd('/')}/{targetDir}:/{targetDir}"; - logger.LogDebug(@"Volume binding for {ExpandedPath} is {VolBinding}", expandedPath, volBinding); + logger.LogTrace(@"Volume binding for {ExpandedPath} is {VolBinding}", expandedPath, volBinding); return volBinding; } diff --git a/src/Tes.Runner/Events/EventSink.cs b/src/Tes.Runner/Events/EventSink.cs index 3a61957fc..c989e1f58 100644 --- a/src/Tes.Runner/Events/EventSink.cs +++ b/src/Tes.Runner/Events/EventSink.cs @@ -26,7 +26,7 @@ public async Task PublishEventAsync(EventMessage eventMessage) public void Start() { - logger.LogDebug("Starting events processing handler"); + logger.LogTrace("Starting events processing handler"); eventHandlerTask = Task.Run(async () => await EventHandlerAsync()); } @@ -41,7 +41,7 @@ public async Task StopAsync() if (eventHandlerTask.IsCompleted) { - logger.LogDebug("Task already completed"); + logger.LogTrace("Task already completed"); return; } @@ -70,11 +70,11 @@ private async Task EventHandlerAsync() { try { - logger.LogDebug($"Handling event. 
Event Name: {eventMessage.Name} Event ID: {eventMessage.Id} "); + logger.LogTrace($"Handling event. Event Name: {eventMessage.Name} Event ID: {eventMessage.Id} "); await HandleEventAsync(eventMessage); - logger.LogDebug($"Event handled. Event Name: {eventMessage.Name} Event ID: {eventMessage.Id} "); + logger.LogTrace($"Event handled. Event Name: {eventMessage.Name} Event ID: {eventMessage.Id} "); } catch (Exception e) { diff --git a/src/Tes.Runner/Events/EventsPublisher.cs b/src/Tes.Runner/Events/EventsPublisher.cs index 39e4f7364..807a943f0 100644 --- a/src/Tes.Runner/Events/EventsPublisher.cs +++ b/src/Tes.Runner/Events/EventsPublisher.cs @@ -224,7 +224,7 @@ private async Task PublishAsync(EventMessage message) foreach (var sink in sinks) { - logger.LogInformation("Publishing event {MessageName} to sink: {SinkType}", message.Name, sink.GetType().Name); + logger.LogDebug("Publishing event {MessageName} to sink: {SinkType}", message.Name, sink.GetType().Name); await sink.PublishEventAsync(message); } diff --git a/src/Tes.Runner/Executor.cs b/src/Tes.Runner/Executor.cs index ff4f31c4c..440df85e2 100644 --- a/src/Tes.Runner/Executor.cs +++ b/src/Tes.Runner/Executor.cs @@ -170,7 +170,7 @@ private async Task UploadOutputsAsync(BlobPipelineOptions blobPip var executionResult = await TimedExecutionAsync(async () => await uploader.UploadAsync(outputs)); - logger.LogInformation("Executed Upload. Time elapsed: {ElapsedTime} Bandwidth: {BandwidthMiBpS} MiB/s", executionResult.Elapsed, BlobSizeUtils.ToBandwidth(executionResult.Result, executionResult.Elapsed.TotalSeconds)); + logger.LogDebug("Executed Upload. Time elapsed: {ElapsedTime} Bandwidth: {BandwidthMiBpS} MiB/s", executionResult.Elapsed, BlobSizeUtils.ToBandwidth(executionResult.Result, executionResult.Elapsed.TotalSeconds)); return new(executionResult.Result, uploader.CompletedFiles); } @@ -179,7 +179,7 @@ private async Task UploadOutputsAsync(BlobPipelineOptions blobPip { if ((tesNodeTask.Outputs ?? 
[]).Count == 0) { - logger.LogInformation("No outputs provided"); + logger.LogDebug("No outputs provided"); { return default; } @@ -270,7 +270,7 @@ private async Task DownloadInputsAsync(BlobPipelineOptions blobPipelineOpt { if (tesNodeTask.Inputs is null || tesNodeTask.Inputs.Count == 0) { - logger.LogInformation("No inputs provided"); + logger.LogDebug("No inputs provided"); { return default; } @@ -289,10 +289,10 @@ private static void ValidateBlockSize(int blockSizeBytes) private void LogStartConfig(BlobPipelineOptions blobPipelineOptions) { - logger.LogInformation("Writers: {NumberOfWriters}", blobPipelineOptions.NumberOfWriters); - logger.LogInformation("Readers: {NumberOfReaders}", blobPipelineOptions.NumberOfReaders); - logger.LogInformation("Capacity: {ReadWriteBuffersCapacity}", blobPipelineOptions.ReadWriteBuffersCapacity); - logger.LogInformation("BlockSize: {BlockSizeBytes}", blobPipelineOptions.BlockSizeBytes); + logger.LogDebug("Writers: {NumberOfWriters}", blobPipelineOptions.NumberOfWriters); + logger.LogDebug("Readers: {NumberOfReaders}", blobPipelineOptions.NumberOfReaders); + logger.LogDebug("Capacity: {ReadWriteBuffersCapacity}", blobPipelineOptions.ReadWriteBuffersCapacity); + logger.LogDebug("BlockSize: {BlockSizeBytes}", blobPipelineOptions.BlockSizeBytes); } private static async Task> TimedExecutionAsync(Func> execution) diff --git a/src/Tes.Runner/Storage/FileOperationResolver.cs b/src/Tes.Runner/Storage/FileOperationResolver.cs index cd30615c6..bff96084c 100644 --- a/src/Tes.Runner/Storage/FileOperationResolver.cs +++ b/src/Tes.Runner/Storage/FileOperationResolver.cs @@ -159,7 +159,7 @@ private IEnumerable ExpandFileOutput(FileOutput output) if (fileInfoProvider.FileExists(expandedPath)) { //treat the output as a single file and use the target URL as is - logger.LogInformation("Adding file: {ExpandedPath} to the output list with a target URL as is", expandedPath); + logger.LogDebug("Adding file: {ExpandedPath} to the output list with a 
target URL as is", expandedPath); yield return CreateExpandedFileOutputUsingTargetUrl(output, absoluteFilePath: expandedPath); @@ -182,7 +182,7 @@ private IEnumerable ExpandFileOutput(FileOutput output) foreach (var file in fileInfoProvider.GetFilesBySearchPattern(rootPathPair.Root, rootPathPair.RelativePath)) { - logger.LogInformation("Adding file: {RelativePathToSearchPath} with absolute path: {AbsolutePath} to the output list with a combined target URL", file.RelativePathToSearchPath, file.AbsolutePath); + logger.LogDebug("Adding file: {RelativePathToSearchPath} with absolute path: {AbsolutePath} to the output list with a combined target URL", file.RelativePathToSearchPath, file.AbsolutePath); yield return CreateExpandedFileOutputWithCombinedTargetUrl(output, absoluteFilePath: file.AbsolutePath, relativePathToSearchPath: file.RelativePathToSearchPath); } diff --git a/src/Tes.Runner/Storage/TerraUrlTransformationStrategy.cs b/src/Tes.Runner/Storage/TerraUrlTransformationStrategy.cs index 19887cece..1eca3f59e 100644 --- a/src/Tes.Runner/Storage/TerraUrlTransformationStrategy.cs +++ b/src/Tes.Runner/Storage/TerraUrlTransformationStrategy.cs @@ -80,7 +80,7 @@ private async Task GetMappedSasUrlFromWsmAsync(TerraBlobInfo blobInfo, Blob { var tokenInfo = await GetWorkspaceSasTokenFromWsmAsync(blobInfo, blobSasPermissions); - logger.LogInformation("Successfully obtained the SAS URL from Terra. WSM resource ID:{ContainerResourceId}", blobInfo.WsmContainerResourceId); + logger.LogDebug("Successfully obtained the SAS URL from Terra. WSM resource ID:{ContainerResourceId}", blobInfo.WsmContainerResourceId); var uriBuilder = new UriBuilder(tokenInfo.Url); @@ -99,7 +99,7 @@ private async Task GetWorkspaceSasTokenFromWsmAsync(Terr { var tokenParams = CreateTokenParamsFromOptions(sasBlobPermissions); - logger.LogInformation( + logger.LogDebug( "Getting SAS URL from Terra. 
WSM workspace ID:{WorkspaceId}", blobInfo.WorkspaceId); var cacheKey = $"{blobInfo.WorkspaceId}-{blobInfo.WsmContainerResourceId}-{tokenParams.SasPermission}"; @@ -111,7 +111,7 @@ private async Task GetWorkspaceSasTokenFromWsmAsync(Terr throw new InvalidOperationException("The value retrieved from the cache is null"); } - logger.LogInformation("SAS URL found in cache. WSM resource ID:{ContainerResourceId}", blobInfo.WsmContainerResourceId); + logger.LogDebug("SAS URL found in cache. WSM resource ID:{ContainerResourceId}", blobInfo.WsmContainerResourceId); return tokenInfo; } @@ -196,11 +196,11 @@ private async Task GetTerraBlobInfoFromContainerNameAsync(string CheckIfAccountIsTerraStorageAccount(blobUriBuilder.AccountName); - logger.LogInformation("Getting Workspace ID from the Container Name: {BlobContainerName}", blobUriBuilder.BlobContainerName); + logger.LogDebug("Getting Workspace ID from the Container Name: {BlobContainerName}", blobUriBuilder.BlobContainerName); var workspaceId = ToWorkspaceId(blobUriBuilder.BlobContainerName); - logger.LogInformation("Workspace ID to use: {WorkspaceId}", workspaceId); + logger.LogDebug("Workspace ID to use: {WorkspaceId}", workspaceId); var wsmContainerResourceId = await GetWsmContainerResourceIdFromCacheOrWsmAsync(workspaceId, blobUriBuilder.BlobContainerName); @@ -242,7 +242,7 @@ private Guid ToWorkspaceId(string segmentsContainerName) private async Task GetWsmContainerResourceIdFromCacheOrWsmAsync(Guid workspaceId, string containerName) { - logger.LogInformation("Getting container resource information from WSM. Workspace ID: {WorkspaceId} Container Name: {BlobContainerName}", workspaceId, containerName); + logger.LogDebug("Getting container resource information from WSM. 
Workspace ID: {WorkspaceId} Container Name: {BlobContainerName}", workspaceId, containerName); try { @@ -250,7 +250,7 @@ private async Task GetWsmContainerResourceIdFromCacheOrWsmAsync(Guid works if (memoryCache.TryGetValue(cacheKey, out Guid wsmContainerResourceId)) { - logger.LogInformation("Found the container resource ID in cache. Resource ID: {ContainerResourceId} Container Name: {BlobContainerName}", wsmContainerResourceId, containerName); + logger.LogDebug("Found the container resource ID in cache. Resource ID: {ContainerResourceId} Container Name: {BlobContainerName}", wsmContainerResourceId, containerName); return wsmContainerResourceId; } @@ -263,7 +263,7 @@ private async Task GetWsmContainerResourceIdFromCacheOrWsmAsync(Guid works r.ResourceAttributes.AzureStorageContainer.StorageContainerName.Equals(containerName, StringComparison.OrdinalIgnoreCase)).Metadata; - logger.LogInformation("Found the resource ID for storage container resource. Resource ID: {ContainerResourceId} Container Name: {BlobContainerName}", metadata.ResourceId, containerName); + logger.LogDebug("Found the resource ID for storage container resource. Resource ID: {ContainerResourceId} Container Name: {BlobContainerName}", metadata.ResourceId, containerName); var resourceId = Guid.Parse(metadata.ResourceId); diff --git a/src/Tes.Runner/Transfer/BlobDownloader.cs b/src/Tes.Runner/Transfer/BlobDownloader.cs index acf92ce3c..f3d368d8d 100644 --- a/src/Tes.Runner/Transfer/BlobDownloader.cs +++ b/src/Tes.Runner/Transfer/BlobDownloader.cs @@ -108,7 +108,7 @@ public override async Task GetSourceLengthAsync(string source) /// public override Task OnCompletionAsync(long length, Uri? blobUrl, string fileName, string? rootHash, string? contentMd5) { - Logger.LogInformation($"Completed download. Total bytes: {length:n0} Filename: {fileName}"); + Logger.LogDebug($"Completed download. 
Total bytes: {length:n0} Filename: {fileName}"); return Task.CompletedTask; } diff --git a/src/Tes.Runner/Transfer/BlobOperationPipeline.cs b/src/Tes.Runner/Transfer/BlobOperationPipeline.cs index f7011f203..a79b69a94 100644 --- a/src/Tes.Runner/Transfer/BlobOperationPipeline.cs +++ b/src/Tes.Runner/Transfer/BlobOperationPipeline.cs @@ -64,13 +64,13 @@ protected BlobOperationPipeline(BlobPipelineOptions pipelineOptions, Channel ExecutePipelineAsync(List operatio { await WhenAllFailFast(pipelineTasks); - Logger.LogInformation("Pipeline processing completed."); + Logger.LogDebug("Pipeline processing completed."); } catch (Exception e) { @@ -99,9 +99,9 @@ protected async Task ExecutePipelineAsync(List operatio throw; } - Logger.LogInformation("Waiting for processed part processor to complete."); + Logger.LogDebug("Waiting for processed part processor to complete."); var bytesProcessed = await processedPartsProcessorTask; - Logger.LogInformation("Processed parts completed."); + Logger.LogDebug("Processed parts completed."); return bytesProcessed; } diff --git a/src/Tes.Runner/Transfer/DefaultFileInfoProvider.cs b/src/Tes.Runner/Transfer/DefaultFileInfoProvider.cs index 69d6b140d..cd71368b4 100644 --- a/src/Tes.Runner/Transfer/DefaultFileInfoProvider.cs +++ b/src/Tes.Runner/Transfer/DefaultFileInfoProvider.cs @@ -15,25 +15,25 @@ public class DefaultFileInfoProvider : IFileInfoProvider public long GetFileSize(string fileName) { - logger.LogDebug("Getting file size for file: {Path}", fileName); + logger.LogTrace("Getting file size for file: {Path}", fileName); return GetFileInfoOrThrowIfFileDoesNotExist(fileName).Length; } public string GetExpandedFileName(string fileName) { - logger.LogDebug("Expanding file name: {Path}", fileName); + logger.LogTrace("Expanding file name: {Path}", fileName); var expandedValue = Environment.ExpandEnvironmentVariables(fileName); - logger.LogDebug("Expanded file name: {ExpandedPath}", expandedValue); + logger.LogTrace("Expanded file name: 
{ExpandedPath}", expandedValue); return expandedValue; } public bool FileExists(string fileName) { - logger.LogDebug("Checking if file exists: {Path}", fileName); + logger.LogTrace("Checking if file exists: {Path}", fileName); var fileInfo = new FileInfo(Environment.ExpandEnvironmentVariables(fileName)); @@ -43,7 +43,7 @@ public bool FileExists(string fileName) public List GetFilesBySearchPattern(string searchPath, string searchPattern) { - logger.LogInformation("Searching for files in the search path: {Path} with search pattern: {SearchPattern}", searchPath, searchPattern); + logger.LogDebug("Searching for files in the search path: {Path} with search pattern: {SearchPattern}", searchPath, searchPattern); return Directory.GetFiles(Environment.ExpandEnvironmentVariables(searchPath), Environment.ExpandEnvironmentVariables(searchPattern), SearchOption.AllDirectories) .Select(f => new FileResult(f, ToRelativePathToSearchPath(searchPath, searchPattern, f), searchPath)) @@ -62,7 +62,7 @@ private string ToRelativePathToSearchPath(string searchPath, string searchPatter if (!string.IsNullOrWhiteSpace(prefixToRemove) && absolutePath.StartsWith(prefixToRemove)) { - logger.LogInformation("Removing prefix: {Prefix} from absolute path: {Path}", prefixToRemove, absolutePath); + logger.LogDebug("Removing prefix: {Prefix} from absolute path: {Path}", prefixToRemove, absolutePath); return absolutePath[(prefixToRemove.Length + 1)..]; } @@ -74,7 +74,7 @@ public List GetAllFilesInDirectory(string path) { var expandedPath = Environment.ExpandEnvironmentVariables(path); - logger.LogInformation("Getting all files in directory: {Path}", expandedPath); + logger.LogDebug("Getting all files in directory: {Path}", expandedPath); if (!Directory.Exists(expandedPath)) { diff --git a/src/Tes.Runner/Transfer/Md5HashListProvider.cs b/src/Tes.Runner/Transfer/Md5HashListProvider.cs index b6717b851..11a99a34c 100644 --- a/src/Tes.Runner/Transfer/Md5HashListProvider.cs +++ 
b/src/Tes.Runner/Transfer/Md5HashListProvider.cs @@ -43,7 +43,7 @@ public string GetRootHash() var rootHash = CreateBlockMd5CheckSumValue(data, 0, data.Length); - logger.LogInformation($"Root Hash: {rootHash} set in property: {BlobApiHttpUtils.RootHashMetadataName}"); + logger.LogDebug($"Root Hash: {rootHash} set in property: {BlobApiHttpUtils.RootHashMetadataName}"); return rootHash; } diff --git a/src/Tes.Runner/Transfer/PartsProcessor.cs b/src/Tes.Runner/Transfer/PartsProcessor.cs index cb700f394..fe4ad4ab3 100644 --- a/src/Tes.Runner/Transfer/PartsProcessor.cs +++ b/src/Tes.Runner/Transfer/PartsProcessor.cs @@ -73,7 +73,7 @@ protected Task StartProcessorsWithScalingStrategyAsync(int numberOfProcessors, C if (!scalingStrategy.IsScalingAllowed(p, currentMaxPartProcessingTime)) { - logger.LogInformation("The maximum number of tasks for the transfer operation has been set. Max part processing time is: {currentMaxPartProcessingTimeInMs} ms. Processing tasks count: {processorCount}.", currentMaxPartProcessingTime, p); + logger.LogDebug("The maximum number of tasks for the transfer operation has been set. Max part processing time is: {currentMaxPartProcessingTimeInMs} ms. Processing tasks count: {processorCount}.", currentMaxPartProcessingTime, p); break; } } @@ -84,13 +84,13 @@ protected Task StartProcessorsWithScalingStrategyAsync(int numberOfProcessors, C if (readFromChannel.Reader.Completion.IsCompleted) { - logger.LogInformation("The readFromChannel is completed, no need to add more processing tasks. Processing tasks count: {processorCount}.", p); + logger.LogDebug("The readFromChannel is completed, no need to add more processing tasks. 
Processing tasks count: {processorCount}.", p); break; } var delay = scalingStrategy.GetScalingDelay(p); - logger.LogInformation("Increasing the number of processing tasks to {processorCount}", p + 1); + logger.LogDebug("Increasing the number of processing tasks to {processorCount}", p + 1); tasks.Add(StartProcessorTaskAsync(readFromChannel, processorAsync, cancellationSource)); diff --git a/src/Tes.Runner/Transfer/PartsProducer.cs b/src/Tes.Runner/Transfer/PartsProducer.cs index 9010a45e5..e4d741fa3 100644 --- a/src/Tes.Runner/Transfer/PartsProducer.cs +++ b/src/Tes.Runner/Transfer/PartsProducer.cs @@ -49,7 +49,7 @@ public async Task StartPartsProducersAsync(List blobOperation { await Task.WhenAll(partsProducerTasks); - logger.LogInformation("All parts from requested operations were created."); + logger.LogDebug("All parts from requested operations were created."); } catch (Exception e) { diff --git a/src/Tes.Runner/Transfer/PartsReader.cs b/src/Tes.Runner/Transfer/PartsReader.cs index af87fe119..1aaf554ae 100644 --- a/src/Tes.Runner/Transfer/PartsReader.cs +++ b/src/Tes.Runner/Transfer/PartsReader.cs @@ -53,6 +53,6 @@ async Task ReadPartAsync(PipelineBuffer buffer, CancellationToken cancellationTo writeBufferChannel.Writer.Complete(); } - logger.LogInformation("All part read operations completed successfully."); + logger.LogDebug("All part read operations completed successfully."); } } diff --git a/src/Tes.Runner/Transfer/PartsWriter.cs b/src/Tes.Runner/Transfer/PartsWriter.cs index 5931ba43a..c5de2fd71 100644 --- a/src/Tes.Runner/Transfer/PartsWriter.cs +++ b/src/Tes.Runner/Transfer/PartsWriter.cs @@ -53,7 +53,7 @@ async Task WritePartAsync(PipelineBuffer buffer, CancellationToken cancellationT processedBufferChannel.Writer.Complete(); } - logger.LogInformation("All part write operations completed successfully."); + logger.LogDebug("All part write operations completed successfully."); } private ProcessedBuffer ToProcessedBuffer(PipelineBuffer buffer) diff --git 
a/src/Tes.Runner/Transfer/ProcessedPartsProcessor.cs b/src/Tes.Runner/Transfer/ProcessedPartsProcessor.cs index 29cd85bdc..f19ed27d6 100644 --- a/src/Tes.Runner/Transfer/ProcessedPartsProcessor.cs +++ b/src/Tes.Runner/Transfer/ProcessedPartsProcessor.cs @@ -70,7 +70,7 @@ public async ValueTask StartProcessedPartsProcessorAsync(int expectedNumbe readBufferChannel.Writer.Complete(); } - logger.LogInformation("All parts were successfully processed."); + logger.LogDebug("All parts were successfully processed."); return totalBytes; } @@ -99,7 +99,7 @@ private async Task CompleteFileProcessingAsync(ProcessedBuffer buffer, Cancellat if (!cancellationTokenSource.IsCancellationRequested) { - logger.LogDebug("Cancelling tasks in the processed parts processor."); + logger.LogTrace("Cancelling tasks in the processed parts processor."); cancellationTokenSource.Cancel(); } throw; diff --git a/src/Tes.RunnerCLI/Commands/CommandHandlers.cs b/src/Tes.RunnerCLI/Commands/CommandHandlers.cs index f30f83586..d7858d614 100644 --- a/src/Tes.RunnerCLI/Commands/CommandHandlers.cs +++ b/src/Tes.RunnerCLI/Commands/CommandHandlers.cs @@ -132,7 +132,7 @@ internal static async Task ExecuteExecCommandAsync(Uri? fileUri, FileInfo? 
{ var nodeTask = await nodeTaskUtils.ResolveNodeTaskAsync(file, fileUri, apiVersion); - Logger.LogDebug("Executing commands in container for Task ID: {NodeTaskId}", nodeTask.Id); + Logger.LogTrace("Executing commands in container for Task ID: {NodeTaskId}", nodeTask.Id); await using var executor = await Executor.CreateExecutorAsync(nodeTask, apiVersion); @@ -181,7 +181,7 @@ internal static async Task ExecuteUploadCommandAsync( string apiVersion) { - Logger.LogDebug("Starting upload operation."); + Logger.LogTrace("Starting upload operation."); var nodeTask = await nodeTaskUtils.ResolveNodeTaskAsync(file, fileUri, apiVersion); @@ -213,7 +213,7 @@ internal static async Task ExecuteDownloadCommandAsync( { var options = CommandLauncher.CreateBlobPipelineOptions(blockSize, writers, readers, bufferCapacity, apiVersion, setContentMd5OnUploads: false); - Logger.LogDebug("Starting download operation."); + Logger.LogTrace("Starting download operation."); var nodeTask = await nodeTaskUtils.ResolveNodeTaskAsync(file, fileUri, apiVersion); diff --git a/src/Tes.RunnerCLI/Commands/ProcessLauncher.cs b/src/Tes.RunnerCLI/Commands/ProcessLauncher.cs index a59974ce6..5c9b74e14 100644 --- a/src/Tes.RunnerCLI/Commands/ProcessLauncher.cs +++ b/src/Tes.RunnerCLI/Commands/ProcessLauncher.cs @@ -53,7 +53,7 @@ private async Task StartAndWaitForExitAsync(Process process) await logReader.WaitUntilAsync(timeout: TimeSpan.FromSeconds(LogWaitTimeoutInSeconds)); - logger.LogInformation("Process exited. Arguments: {ProcessArguments}", process.StartInfo.Arguments); + logger.LogDebug("Process exited. 
Arguments: {ProcessArguments}", process.StartInfo.Arguments); } private void SetupErrorAndOutputReaders(Process process) @@ -69,7 +69,7 @@ private void SetupProcessStartInfo(string[] options, Process process) process.StartInfo.RedirectStandardOutput = true; process.StartInfo.RedirectStandardError = true; - logger.LogInformation("Starting process: {ProcessFileName} {Process.Arguments}", process.StartInfo.FileName, process.StartInfo.Arguments); + logger.LogDebug("Starting process: {ProcessFileName} {Process.Arguments}", process.StartInfo.FileName, process.StartInfo.Arguments); } private static string? GetExecutableFullPath() diff --git a/src/TesApi.Web/AllowedVmSizesService.cs b/src/TesApi.Web/AllowedVmSizesService.cs index 65ae7746b..22f845b0b 100644 --- a/src/TesApi.Web/AllowedVmSizesService.cs +++ b/src/TesApi.Web/AllowedVmSizesService.cs @@ -39,7 +39,7 @@ private async Task GetAllowedVmSizesImpl(CancellationToken cancellationToken) { try { - logger.LogInformation("Executing allowed vm sizes config setup"); + logger.LogDebug("Executing allowed vm sizes config setup"); allowedVmSizes = await configUtils.ProcessAllowedVmSizesConfigurationFileAsync(cancellationToken); } catch (Exception e) diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index 17511a07b..4f41efb8d 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -172,13 +172,13 @@ public async Task CreateBatchJobAsync(string jobId, string poolId, CancellationT { ArgumentException.ThrowIfNullOrEmpty(jobId); - logger.LogInformation("TES: Creating Batch job {BatchJob}", jobId); + logger.LogDebug("TES: Creating Batch job {BatchJob}", jobId); var job = batchClient.JobOperations.CreateJob(jobId, new() { PoolId = poolId }); job.OnAllTasksComplete = OnAllTasksComplete.NoAction; job.OnTaskFailure = OnTaskFailure.NoAction; await job.CommitAsync(cancellationToken: cancellationToken); - logger.LogInformation("TES: Batch job {BatchJob} committed successfully", jobId); + 
logger.LogDebug("TES: Batch job {BatchJob} committed successfully", jobId); await Task.Delay(TimeSpan.FromSeconds(2), cancellationToken); } @@ -200,7 +200,7 @@ public async Task AddBatchTasksAsync(IEnumerable cloudTasks, string j public async Task DeleteBatchJobAsync(string jobId, CancellationToken cancellationToken) { ArgumentException.ThrowIfNullOrEmpty(jobId); - logger.LogInformation("Deleting job {BatchJob}", jobId); + logger.LogDebug("Deleting job {BatchJob}", jobId); await batchClient.JobOperations.DeleteJobAsync(jobId, cancellationToken: cancellationToken); } @@ -232,7 +232,7 @@ public async Task TerminateBatchTaskAsync(string tesTaskId, string jobId, Cancel foreach (var task in batchTasksToTerminate) { - logger.LogInformation("Terminating task {BatchTask}", task.Id); + logger.LogDebug("Terminating task {BatchTask}", task.Id); await batchRetryPolicyWhenNodeNotReady.ExecuteWithRetryAsync(ct => task.TerminateAsync(cancellationToken: ct), cancellationToken); } } @@ -240,7 +240,7 @@ public async Task TerminateBatchTaskAsync(string tesTaskId, string jobId, Cancel /// public async Task DeleteBatchTaskAsync(string taskId, string jobId, CancellationToken cancellationToken) { - logger.LogInformation("Deleting task {BatchTask}", taskId); + logger.LogDebug("Deleting task {BatchTask}", taskId); await batchRetryPolicyWhenNodeNotReady.ExecuteWithRetryAsync(ct => batchClient.JobOperations.DeleteTaskAsync(jobId, taskId, cancellationToken: ct), cancellationToken); } diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index 5aa84b3d0..fb795f3df 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -347,16 +347,16 @@ private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken canc switch (node.State) { case ComputeNodeState.Unusable: - _logger.LogDebug("Found unusable node {NodeId}", node.Id); + _logger.LogTrace("Found unusable node {NodeId}", node.Id); break; case ComputeNodeState.StartTaskFailed: - 
_logger.LogDebug("Found starttaskfailed node {NodeId}", node.Id); + _logger.LogTrace("Found starttaskfailed node {NodeId}", node.Id); StartTaskFailures.Enqueue(new(node.Id, node.StartTaskInformation.FailureInformation)); break; case ComputeNodeState.Preempted: - _logger.LogDebug("Found preempted node {NodeId}", node.Id); + _logger.LogTrace("Found preempted node {NodeId}", node.Id); break; default: @@ -559,7 +559,7 @@ ValueTask StandupQueries() // List tasks from batch just one time each time we service the pool when called from PoolScheduler _foundTasks.Clear(); _foundTasks.AddRange(GetTasksAsync("creationTime,executionInfo,id,nodeInfo,state,stateTransitionTime", null).ToBlockingEnumerable(cancellationToken)); - _logger.LogDebug("{PoolId}: {TaskCount} tasks discovered.", PoolId, _foundTasks.Count); + _logger.LogTrace("{PoolId}: {TaskCount} tasks discovered.", PoolId, _foundTasks.Count); // List nodes from Batch at most one time each time we service the pool _lazyComputeNodes = _taskPreviousComputeNodeIds.Count == 0 diff --git a/src/TesApi.Web/BatchPools.BatchScheduler.cs b/src/TesApi.Web/BatchPools.BatchScheduler.cs index 4f7dd8093..3bcdef492 100644 --- a/src/TesApi.Web/BatchPools.BatchScheduler.cs +++ b/src/TesApi.Web/BatchPools.BatchScheduler.cs @@ -209,7 +209,7 @@ public async ValueTask FlushPoolsAsync(IEnumerable assignedPools, Cancel public Task DeletePoolAndJobAsync(IBatchPool pool, CancellationToken cancellationToken) { // TODO: Consider moving any remaining tasks to another pool, or failing tasks explicitly - logger.LogDebug(@"Deleting pool and job {PoolId}", pool.PoolId); + logger.LogTrace(@"Deleting pool and job {PoolId}", pool.PoolId); return Task.WhenAll( AllowIfNotFound(batchPoolManager.DeleteBatchPoolAsync(pool.PoolId, cancellationToken)), diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 37fc54e81..8aa5b0730 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -380,7 +380,7 @@ 
bool HandlePreemptedNode(TesTask tesTask, CombinedBatchTaskInfo batchInfo) newLog.VirtualMachineInfo = oldLog.VirtualMachineInfo; newLog.StartTime = DateTimeOffset.UtcNow; tesTask.State = TesState.INITIALIZING; - logger.LogInformation("The TesTask {TesTask}'s node was preempted. It was automatically rescheduled.", tesTask.Id); + logger.LogDebug("The TesTask {TesTask}'s node was preempted. It was automatically rescheduled.", tesTask.Id); return true; } @@ -479,14 +479,14 @@ private async Task LoadExistingPoolsImplAsync(CancellationToken cancellationToke public async Task UploadTaskRunnerIfNeededAsync(CancellationToken cancellationToken) { var blobUri = await storageAccessProvider.GetInternalTesBlobUrlAsync(NodeTaskRunnerFilename, storageAccessProvider.BlobPermissionsWithWrite, cancellationToken); - logger.LogDebug(@"NodeTaskRunner Uri: {NodeTaskRunnerUri}", new BlobUriBuilder(blobUri) { Sas = null }.ToUri().AbsoluteUri); + logger.LogTrace(@"NodeTaskRunner Uri: {NodeTaskRunnerUri}", new BlobUriBuilder(blobUri) { Sas = null }.ToUri().AbsoluteUri); var blobProperties = await azureProxy.GetBlobPropertiesAsync(blobUri, cancellationToken); - logger.LogDebug(@"NodeTaskRunner MD5: {NodeTaskRunnerMD5}", Convert.ToBase64String(blobProperties?.ContentHash ?? [])); + logger.LogTrace(@"NodeTaskRunner MD5: {NodeTaskRunnerMD5}", Convert.ToBase64String(blobProperties?.ContentHash ?? [])); if (!runnerMD5.Equals(Convert.ToBase64String(blobProperties?.ContentHash ?? []), StringComparison.OrdinalIgnoreCase)) { - logger.LogInformation(@"Uploading NodeTaskRunner"); + logger.LogDebug(@"Uploading NodeTaskRunner"); await azureProxy.UploadBlobFromFileAsync(blobUri, $"scripts/{NodeTaskRunnerFilename}", cancellationToken); - logger.LogInformation(@"Uploaded NodeTaskRunner"); + logger.LogDebug(@"Uploaded NodeTaskRunner"); } } @@ -596,7 +596,7 @@ public async Task ProcessQueuedTesTaskAsync(TesTask tesTask, CancellationT // acrPullIdentity is special. 
Add it to the end of the list even if it is null, so it is always retrievable. identities.Add(await actionIdentityProvider.GetAcrPullActionIdentity(cancellationToken)); - logger.LogDebug(@"Checking quota for {TesTask}.", tesTask.Id); + logger.LogTrace(@"Checking quota for {TesTask}.", tesTask.Id); var virtualMachineInfo = await GetVmSizeAsync(tesTask, cancellationToken); virtualMachineInfo.Identities = identities; @@ -687,7 +687,7 @@ private async ValueTask AttachQueuedTesTaskToBatchPoolAsync(string poolKey var cloudTask = await ConvertTesTaskToBatchTaskUsingRunnerAsync(cloudTaskId, tesTask, virtualMachineInfo.Identities.Last(), virtualMachineInfo.VM.VmFamily, cancellationToken); _ = pool.AssociatedTesTasks.AddOrUpdate(tesTask.Id, key => cloudTask.Id, (key, value) => cloudTask.Id); - logger.LogInformation(@"Creating batch task for TES task {TesTaskId}. Using VM size {VmSize}.", tesTask.Id, virtualMachineInfo.VM.VmSize); + logger.LogDebug(@"Creating batch task for TES task {TesTaskId}. Using VM size {VmSize}.", tesTask.Id, virtualMachineInfo.VM.VmSize); AddTValueToCollectorQueue( key: pool.PoolId, @@ -802,7 +802,7 @@ public async ValueTask PerformShortBackgroundTasksAsync(CancellationToken cancel if (_queuedTesTaskPendingJobBatches.TryDequeue(out var jobBatch)) { var (jobId, tasks) = jobBatch; - logger.LogDebug(@"Adding {AddedTasks} tasks to {CloudJob}.", tasks.Count, jobId); + logger.LogTrace(@"Adding {AddedTasks} tasks to {CloudJob}.", tasks.Count, jobId); await PerformTaskAsync( method: async token => await azureProxy.AddBatchTasksAsync(tasks.Select(t => t.CloudTask), jobId, token), taskCompletions: tasks.Select(task => task.TaskCompletion), @@ -907,7 +907,7 @@ public IAsyncEnumerable PerformLongBackgroundTasksAsync(CancellationToken method: async token => { // This will remove pool keys we cannot accommodate due to quota, along with all of their associated tasks, from being queued into Batch. 
- logger.LogDebug(@"Checking pools and jobs quota to accommodate {NeededPools} additional pools.", poolsByKey.Count); + logger.LogTrace(@"Checking pools and jobs quota to accommodate {NeededPools} additional pools.", poolsByKey.Count); var (exceededQuantity, exception) = await quotaVerifier.CheckBatchAccountPoolAndJobQuotasAsync(poolsByKey.Count, token); @@ -929,7 +929,7 @@ bool TryRemovePool(string key, out IEnumerable result) return true; } - logger.LogDebug(@"Obtaining {NewPools} batch pools.", poolsByKey.Count); + logger.LogTrace(@"Obtaining {NewPools} batch pools.", poolsByKey.Count); foreach (var poolToCreate in poolsByKey.Values.SelectMany(pools => pools)) { diff --git a/src/TesApi.Web/Controllers/TaskServiceApi.cs b/src/TesApi.Web/Controllers/TaskServiceApi.cs index da82a6c69..55894d464 100644 --- a/src/TesApi.Web/Controllers/TaskServiceApi.cs +++ b/src/TesApi.Web/Controllers/TaskServiceApi.cs @@ -89,7 +89,7 @@ public virtual async Task CancelTaskAsync([FromRoute][Required] s } else if (tesTask.State != TesState.CANCELED) { - logger.LogInformation("Canceling task"); + logger.LogInformation("Canceling task {TesTask}", id); tesTask.State = TesState.CANCELING; try @@ -281,7 +281,7 @@ public virtual async Task CreateTaskAsync([FromBody] TesTask tesT [SwaggerResponse(statusCode: 200, type: typeof(TesServiceInfo), description: "")] public virtual IActionResult GetServiceInfo() { - logger.LogInformation("Id: {ServiceInfoId} Name: {ServiceInfoName} Type: {ServiceInfoType} Description: {ServiceInfoDescription} Organization: {ServiceInfoOrganization} ContactUrl: {ServiceInfoContactUrl} DocumentationUrl: {ServiceInfoDocumentationUrl} CreatedAt:{ServiceInfoCreatedAt} UpdatedAt:{ServiceInfoUpdatedAt} Environment: {ServiceInfoEnvironment} Version: {ServiceInfoVersion} Storage: {ServiceInfoStorage} TesResourcesSupportedBackendParameters: {ServiceInfoTesResourcesSupportedBackendParameters}", + logger.LogDebug("Id: {ServiceInfoId} Name: {ServiceInfoName} Type: 
{ServiceInfoType} Description: {ServiceInfoDescription} Organization: {ServiceInfoOrganization} ContactUrl: {ServiceInfoContactUrl} DocumentationUrl: {ServiceInfoDocumentationUrl} CreatedAt:{ServiceInfoCreatedAt} UpdatedAt:{ServiceInfoUpdatedAt} Environment: {ServiceInfoEnvironment} Version: {ServiceInfoVersion} Storage: {ServiceInfoStorage} TesResourcesSupportedBackendParameters: {ServiceInfoTesResourcesSupportedBackendParameters}", serviceInfo.Id, serviceInfo.Name, serviceInfo.Type, serviceInfo.Description, serviceInfo.Organization, serviceInfo.ContactUrl, serviceInfo.DocumentationUrl, serviceInfo.CreatedAt, serviceInfo.UpdatedAt, serviceInfo.Environment, serviceInfo.Version, string.Join(",", serviceInfo.Storage ?? []), string.Join(",", serviceInfo.TesResourcesSupportedBackendParameters ?? [])); return StatusCode(200, serviceInfo); } diff --git a/src/TesApi.Web/Exceptions/OperationCancelledExceptionFilter.cs b/src/TesApi.Web/Exceptions/OperationCancelledExceptionFilter.cs index a623e67fe..fbc0fbb99 100644 --- a/src/TesApi.Web/Exceptions/OperationCancelledExceptionFilter.cs +++ b/src/TesApi.Web/Exceptions/OperationCancelledExceptionFilter.cs @@ -28,7 +28,7 @@ public override void OnException(ExceptionContext context) { if (context.Exception is OperationCanceledException) { - _logger.LogInformation(context.Exception, "Request was cancelled"); + _logger.LogDebug(context.Exception, "Request was cancelled"); // TODO: 503? If so, how to add "Retry-After" and ensure that caching-related headers are not enabling caching of this result? // Alternatively: 429? Microsoft.AspNetCore.Http.StatusCodes.Status429TooManyRequests // If we are overloaded we do want to signal readiness for a retry, but this may also signal the service being shutdown. 
diff --git a/src/TesApi.Web/Management/ArmBatchQuotaProvider.cs b/src/TesApi.Web/Management/ArmBatchQuotaProvider.cs index b32df2e54..e3569aa57 100644 --- a/src/TesApi.Web/Management/ArmBatchQuotaProvider.cs +++ b/src/TesApi.Web/Management/ArmBatchQuotaProvider.cs @@ -95,7 +95,7 @@ private async Task GetBatchAccountQuotasImplAsync(Cance { try { - logger.LogInformation($"Getting quota information for Batch Account: {clientsFactory.BatchAccountInformation.Name} calling ARM API"); + logger.LogDebug($"Getting quota information for Batch Account: {clientsFactory.BatchAccountInformation.Name} calling ARM API"); var managementClient = clientsFactory.CreateBatchAccountManagementClient(); var batchAccount = (await managementClient.GetAsync(cancellationToken: cancellationToken)).Value.Data; diff --git a/src/TesApi.Web/Management/Batch/ArmBatchPoolManager.cs b/src/TesApi.Web/Management/Batch/ArmBatchPoolManager.cs index 3dc9041c9..93815899f 100644 --- a/src/TesApi.Web/Management/Batch/ArmBatchPoolManager.cs +++ b/src/TesApi.Web/Management/Batch/ArmBatchPoolManager.cs @@ -45,11 +45,11 @@ public async Task CreateBatchPoolAsync(BatchAccountPoolData poolSpec, bo var batchManagementClient = azureClientsFactory.CreateBatchAccountManagementClient(); - logger.LogInformation("Creating batch pool named {PoolName} with vmSize {PoolVmSize} and low priority {IsPreemptable}", nameItem.Value, poolSpec.VmSize, isPreemptable); + logger.LogDebug("Creating batch pool named {PoolName} with vmSize {PoolVmSize} and low priority {IsPreemptable}", nameItem.Value, poolSpec.VmSize, isPreemptable); _ = await batchManagementClient.GetBatchAccountPools().CreateOrUpdateAsync(Azure.WaitUntil.Completed, nameItem.Value, poolSpec, cancellationToken: cancellationToken); - logger.LogInformation("Successfully created batch pool named {PoolName} with vmSize {PoolVmSize} and low priority {IsPreemptable}", nameItem.Value, poolSpec.VmSize, isPreemptable); + logger.LogDebug("Successfully created batch pool named 
{PoolName} with vmSize {PoolVmSize} and low priority {IsPreemptable}", nameItem.Value, poolSpec.VmSize, isPreemptable); return nameItem.Value; } @@ -68,13 +68,13 @@ public async Task DeleteBatchPoolAsync(string poolId, CancellationToken cancella { var batchManagementClient = azureClientsFactory.CreateBatchAccountManagementClient(); - logger.LogInformation( + logger.LogDebug( @"Deleting pool with the id/name:{PoolName} in Batch account:{BatchAccountName}", poolId, azureClientsFactory.BatchAccountInformation.Name); _ = await batchManagementClient.GetBatchAccountPools().Get(poolId, cancellationToken: cancellationToken).Value .DeleteAsync(Azure.WaitUntil.Completed, cancellationToken); - logger.LogInformation( + logger.LogDebug( @"Successfully deleted pool with the id/name:{PoolName} in Batch account:{BatchAccountName}", poolId, azureClientsFactory.BatchAccountInformation.Name); } catch (Exception exc) diff --git a/src/TesApi.Web/Management/Batch/PoolMetadataReader.cs b/src/TesApi.Web/Management/Batch/PoolMetadataReader.cs index 5109009a7..8299e2541 100644 --- a/src/TesApi.Web/Management/Batch/PoolMetadataReader.cs +++ b/src/TesApi.Web/Management/Batch/PoolMetadataReader.cs @@ -57,7 +57,7 @@ public PoolMetadataReader(IOptions terraOptions, IAzureProxy azure /// When pool is not found public virtual async ValueTask GetMetadataValueAsync(string poolId, string key, CancellationToken cancellationToken) { - logger.LogInformation(@"Getting metadata from pool {PoolId}. Key {MetadataKey}", poolId, key); + logger.LogDebug(@"Getting metadata from pool {PoolId}. 
Key {MetadataKey}", poolId, key); var poolMetadata = (await azureProxy.GetBatchPoolAsync(poolId, cancellationToken: cancellationToken, new ODATADetailLevel { SelectClause = "metadata" }))?.Metadata; diff --git a/src/TesApi.Web/Management/Batch/TerraBatchPoolManager.cs b/src/TesApi.Web/Management/Batch/TerraBatchPoolManager.cs index 8d9a50cb1..2cc7742a1 100644 --- a/src/TesApi.Web/Management/Batch/TerraBatchPoolManager.cs +++ b/src/TesApi.Web/Management/Batch/TerraBatchPoolManager.cs @@ -127,14 +127,14 @@ public async Task DeleteBatchPoolAsync(string poolId, CancellationToken cancella try { - logger.LogInformation( + logger.LogDebug( "Deleting pool with the ID/name: {PoolId}", poolId); var wsmResourceId = await GetWsmResourceIdFromBatchPoolMetadataAsync(poolId, cancellationToken); await terraWsmApiClient.Value.DeleteBatchPoolAsync(Guid.Parse(terraOptions.WorkspaceId), wsmResourceId, cancellationToken); - logger.LogInformation( + logger.LogDebug( "Successfully deleted pool with the ID/name via WSM: {PoolId}", poolId); } catch (Exception e) diff --git a/src/TesApi.Web/Management/PriceApiBatchSkuInformationProvider.cs b/src/TesApi.Web/Management/PriceApiBatchSkuInformationProvider.cs index e5c2038c0..890f08304 100644 --- a/src/TesApi.Web/Management/PriceApiBatchSkuInformationProvider.cs +++ b/src/TesApi.Web/Management/PriceApiBatchSkuInformationProvider.cs @@ -85,7 +85,7 @@ public async Task> GetVmSizesAndPricesAsync(stri return await GetVmSizesAndPricesImplAsync(region, cancellationToken); } - logger.LogInformation("Trying to get pricing information from the cache for region: {Region}.", region); + logger.LogDebug("Trying to get pricing information from the cache for region: {Region}.", region); return await appCache.GetOrCreateAsync(VmSizesAndPricesKey(region), async entry => { entry.AbsoluteExpirationRelativeToNow = TimeSpan.FromDays(1); return await GetVmSizesAndPricesImplAsync(region, cancellationToken); }); } @@ -103,26 +103,26 @@ public async Task> 
GetStorageDisksAndPricesAsync(string region return await GetStorageDisksAndPricesImplAsync(region, capacity, maxDataDiskCount, cancellationToken); } - logger.LogInformation("Trying to get pricing information from the cache for region: {Region}.", region); + logger.LogDebug("Trying to get pricing information from the cache for region: {Region}.", region); return await appCache.GetOrCreateAsync(StorageDisksAndPricesKey(region, capacity, maxDataDiskCount), async entry => { entry.AbsoluteExpirationRelativeToNow = TimeSpan.FromDays(1); return await GetStorageDisksAndPricesImplAsync(region, capacity, maxDataDiskCount, cancellationToken); }); } private async Task> GetVmSizesAndPricesImplAsync(string region, CancellationToken cancellationToken) { - logger.LogInformation("Getting VM sizes and price information for region: {Region}", region); + logger.LogDebug("Getting VM sizes and price information for region: {Region}", region); var localVmSizeInfoForBatchSupportedSkus = (await GetLocalDataAsync($"BatchSupportedVmSizeInformation_{azureCloudConfig.Name.ToUpperInvariant()}.json", "Reading local VM size information from file: {LocalVmPriceList}", cancellationToken)) .Where(x => x.RegionsAvailable.Contains(region, StringComparer.OrdinalIgnoreCase)) .ToList(); - logger.LogInformation("localVmSizeInfoForBatchSupportedSkus.Count: {CountOfPrepreparedSkuRecordsInRegion}", localVmSizeInfoForBatchSupportedSkus.Count); + logger.LogDebug("localVmSizeInfoForBatchSupportedSkus.Count: {CountOfPrepreparedSkuRecordsInRegion}", localVmSizeInfoForBatchSupportedSkus.Count); try { var pricingItems = await priceApiClient.GetAllPricingInformationForNonWindowsAndNonSpotVmsAsync(region, cancellationToken).ToListAsync(cancellationToken); - logger.LogInformation("Received {CountOfSkuPrice} SKU pricing items", pricingItems.Count); + logger.LogDebug("Received {CountOfSkuPrice} SKU pricing items", pricingItems.Count); if (pricingItems == null || pricingItems.Count == 0) { @@ -178,16 +178,16 @@ private 
static VirtualMachineInformation CreateVirtualMachineInfoFromReference( private async Task> GetStorageDisksAndPricesImplAsync(string region, double capacity, int maxDataDiskCount, CancellationToken cancellationToken) { - logger.LogInformation("Getting VM sizes and price information for region: {Region}", region); + logger.LogDebug("Getting VM sizes and price information for region: {Region}", region); var localStorageDisksAndPrices = (await GetLocalDataAsync("BatchDataDiskInformation.json", "Reading local storage disk information from file: {LocalStoragePriceList}", cancellationToken)).Where(disk => disk.CapacityInGiB > 0).ToList(); - logger.LogInformation("localStorageDisksAndPrices.Count: {CountOfLocalStorageDisks}", localStorageDisksAndPrices.Count); + logger.LogDebug("localStorageDisksAndPrices.Count: {CountOfLocalStorageDisks}", localStorageDisksAndPrices.Count); try { var pricingItems = await (appCache?.GetOrCreateAsync(StorageDisksAndPricesKey(region), async entry => { entry.AbsoluteExpirationRelativeToNow = TimeSpan.FromDays(1); return await GetPricingData(priceApiClient, region, cancellationToken); }) ?? 
GetPricingData(priceApiClient, region, cancellationToken)); - logger.LogInformation("Received {CountOfDataDiskPrice} Storage pricing items", pricingItems.Count); + logger.LogDebug("Received {CountOfDataDiskPrice} Storage pricing items", pricingItems.Count); if (pricingItems.Count == 0) { @@ -249,7 +249,7 @@ private async Task> GetLocalDataAsync(string fileName, string logMess { var filePath = Path.Combine(AppContext.BaseDirectory, fileName); #pragma warning disable CA2254 // Template should be a static expression - logger.LogInformation(logMessage, filePath); + logger.LogDebug(logMessage, filePath); #pragma warning restore CA2254 // Template should be a static expression return JsonConvert.DeserializeObject>( diff --git a/src/TesApi.Web/PoolScheduler.cs b/src/TesApi.Web/PoolScheduler.cs index 0ebc7b155..4a7030837 100644 --- a/src/TesApi.Web/PoolScheduler.cs +++ b/src/TesApi.Web/PoolScheduler.cs @@ -216,7 +216,7 @@ await Parallel.ForEachAsync(requeues, cancellationToken, async (id, token) => } else { - Logger.LogDebug("No task state changes from pool/node information this time: PoolId: {PoolId}.", poolId); + Logger.LogTrace("No task state changes from pool/node information this time: PoolId: {PoolId}.", poolId); } } @@ -235,11 +235,11 @@ private async ValueTask ProcessTasksToDelete(IAsyncEnumerable GetCloudTaskStatesAsync( .Where(task => node.Id.Equals(task.PreviousComputeNodeId, StringComparison.InvariantCultureIgnoreCase)) .Select(task => task.CloudTask)) { - Logger.LogDebug("{TaskId} connected to node {NodeId} in state {NodeState}.", task.Id, node.Id, node.State); + Logger.LogTrace("{TaskId} connected to node {NodeId} in state {NodeState}.", task.Id, node.Id, node.State); yield return new(task.Id, node.State switch { @@ -292,7 +292,7 @@ private async IAsyncEnumerable GetCloudTaskStatesAsync( _ => throw new System.Diagnostics.UnreachableException(), }); - Logger.LogDebug("Removing {TaskId} from consideration for other errors.", task.Id); + 
Logger.LogTrace("Removing {TaskId} from consideration for other errors.", task.Id); _ = activeTaskList.Remove(task); } } @@ -363,7 +363,7 @@ IBatchPool.StartTaskFailureInformation PopNextStartTaskFailure() AzureBatchTaskState GetCompletedBatchState(CloudTask task) { - Logger.LogDebug("Getting batch task state from completed task {TesTask}.", BatchScheduler.GetTesTaskIdFromCloudTaskId(task.Id)); + Logger.LogTrace("Getting batch task state from completed task {TesTask}.", BatchScheduler.GetTesTaskIdFromCloudTaskId(task.Id)); return task.ExecutionInformation.Result switch { TaskExecutionResult.Success => new( diff --git a/src/TesApi.Web/Program.cs b/src/TesApi.Web/Program.cs index 7c0035a94..f7fb3d8a5 100644 --- a/src/TesApi.Web/Program.cs +++ b/src/TesApi.Web/Program.cs @@ -109,6 +109,14 @@ static ApplicationInsightsOptions GetApplicationInsightsConnectionString(IConfig { if (context.HostingEnvironment.IsProduction()) { + // This configures Container logging in AKS + logging.AddSimpleConsole(options => + { + options.IncludeScopes = true; + options.SingleLine = true; + options.UseUtcTimestamp = true; + }); + logging.AddConsole(options => options.LogToStandardErrorThreshold = LogLevel.Warning); if (!string.IsNullOrEmpty(applicationInsightsOptions?.ConnectionString)) { diff --git a/src/TesApi.Web/Runner/TaskExecutionScriptingManager.cs b/src/TesApi.Web/Runner/TaskExecutionScriptingManager.cs index ef8a65ffb..631b1a621 100644 --- a/src/TesApi.Web/Runner/TaskExecutionScriptingManager.cs +++ b/src/TesApi.Web/Runner/TaskExecutionScriptingManager.cs @@ -112,14 +112,14 @@ public string ParseBatchRunCommand(BatchScriptAssetsInfo batchScriptAssets) { var batchRunCommand = $"/usr/bin/env -S \"{BatchScheduler.BatchNodeSharedEnvVar}/{BatchScheduler.NodeTaskRunnerFilename} -i '{(new Azure.Storage.Blobs.BlobUriBuilder(batchScriptAssets.NodeTaskUrl) { Sas = null }).ToUri().AbsoluteUri}'\""; - logger.LogInformation("Run command: {RunCommand}", batchRunCommand); + logger.LogDebug("Run 
command: {RunCommand}", batchRunCommand); return batchRunCommand; } private async Task CreateAndUploadNodeTaskAsync(TesTask tesTask, NodeTaskConversionOptions nodeTaskConversionOptions, CancellationToken cancellationToken) { - logger.LogInformation("Creating and uploading node task definition file for Task ID: {TesTask}", tesTask.Id); + logger.LogDebug("Creating and uploading node task definition file for Task ID: {TesTask}", tesTask.Id); var nodeTask = await taskToNodeConverter.ToNodeTaskAsync(tesTask, nodeTaskConversionOptions, cancellationToken); @@ -127,7 +127,7 @@ private async Task CreateAndUploadNodeTaskAsync(TesTask tesTask, NodeTaskCo var nodeTaskUrl = await UploadContentAsBlobToInternalTesLocationAsync(tesTask, nodeTaskContent, NodeTaskFilename, cancellationToken); - logger.LogInformation("Successfully created and uploaded node task definition file for Task ID: {TesTask}", tesTask.Id); + logger.LogDebug("Successfully created and uploaded node task definition file for Task ID: {TesTask}", tesTask.Id); return nodeTaskUrl; } diff --git a/src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs b/src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs index dd3b6a838..7d77f6ff2 100644 --- a/src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs +++ b/src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs @@ -91,7 +91,7 @@ public virtual NodeTaskResolverOptions ToNodeTaskResolverOptions(TesTask task, N if (terraOptions is not null && !string.IsNullOrEmpty(terraOptions.WsmApiHost)) { - logger.LogInformation("Setting up Terra as the runtime environment for the runner"); + logger.LogDebug("Setting up Terra as the runtime environment for the runner"); builder.WithTerraAsRuntimeEnvironment(terraOptions.WsmApiHost, terraOptions.LandingZoneApiHost, terraOptions.SasAllowedIpRange); } @@ -134,7 +134,7 @@ public virtual async Task ToNodeTaskAsync(TesTask task, NodeTaskConver if (terraOptions is not null && !string.IsNullOrEmpty(terraOptions.WsmApiHost)) { - logger.LogInformation("Setting up Terra 
as the runtime environment for the runner"); + logger.LogDebug("Setting up Terra as the runtime environment for the runner"); builder.WithTerraAsRuntimeEnvironment(terraOptions.WsmApiHost, terraOptions.LandingZoneApiHost, terraOptions.SasAllowedIpRange); } @@ -197,7 +197,7 @@ private void BuildOutputs(TesTask task, string defaultStorageAccount, NodeTaskBu { if (task.Outputs is not null) { - logger.LogInformation(@"Mapping {TaskOutputsCount} outputs", task.Outputs.Count); + logger.LogDebug(@"Mapping {TaskOutputsCount} outputs", task.Outputs.Count); var outputs = PrepareLocalOutputsForMapping(task, defaultStorageAccount); @@ -251,7 +251,7 @@ private async Task BuildInputsAsync(TesTask task, NodeTaskBuilder builder, IList { if (task.Inputs is not null || additionalInputs is not null) { - logger.LogInformation($"Mapping inputs"); + logger.LogDebug($"Mapping inputs"); var inputs = await PrepareInputsForMappingAsync(task, defaultStorageAccount, cancellationToken); @@ -284,7 +284,7 @@ private async Task> PrepareInputsForMappingAsync(TesTask tesTask, { var key = $"{input.Path}{input.Url}"; - logger.LogInformation(@"Preparing input {InputPath}", input.Path); + logger.LogDebug(@"Preparing input {InputPath}", input.Path); if (input.Streamable == true) // Don't download files where localization_optional is set to true in WDL (corresponds to "Streamable" property being true on TesInput) { @@ -301,7 +301,7 @@ private async Task> PrepareInputsForMappingAsync(TesTask tesTask, if (preparedInput != null) { - logger.LogInformation(@"Input {InputPath} is a content input", input.Path); + logger.LogDebug(@"Input {InputPath} is a content input", input.Path); inputs.Add(key, preparedInput); continue; } @@ -312,7 +312,7 @@ private async Task> PrepareInputsForMappingAsync(TesTask tesTask, if (preparedInput != null) { - logger.LogInformation(@"Input {InputPath} is a local input", input.Path); + logger.LogDebug(@"Input {InputPath} is a local input", input.Path); inputs.Add(key, 
preparedInput); continue; @@ -322,13 +322,13 @@ private async Task> PrepareInputsForMappingAsync(TesTask tesTask, if (preparedInput != null) { - logger.LogInformation(@"Input {InputPath} is an external storage account input", input.Path); + logger.LogDebug(@"Input {InputPath} is an external storage account input", input.Path); inputs.Add(key, preparedInput); continue; } - logger.LogInformation(@"Input {InputPath} is a regular input", input.Path); + logger.LogDebug(@"Input {InputPath} is a regular input", input.Path); inputs.Add(key, input); } @@ -503,7 +503,7 @@ await storageAccessProvider.GetInternalTesTaskBlobUrlAsync(tesTask, Guid.NewGuid var inputUrl = StorageUrlUtils.RemoveQueryStringFromUrl(inputFileUrl); - logger.LogInformation(@"Successfully uploaded content input as a new blob at: {InputUrl}", inputUrl); + logger.LogDebug(@"Successfully uploaded content input as a new blob at: {InputUrl}", inputUrl); return new TesInput { @@ -521,7 +521,7 @@ private async Task PrepareContentInputAsync(TesTask tesTask, TesInput return default; } - logger.LogInformation(@"The input is content. Uploading its content to the internal storage location. Input path:{InputPath}", input.Path); + logger.LogDebug(@"The input is content. Uploading its content to the internal storage location. 
Input path:{InputPath}", input.Path); if (input.Type == TesFileType.DIRECTORY) { diff --git a/src/TesApi.Web/Storage/TerraStorageAccessProvider.cs b/src/TesApi.Web/Storage/TerraStorageAccessProvider.cs index a03bcb1de..ab524886b 100644 --- a/src/TesApi.Web/Storage/TerraStorageAccessProvider.cs +++ b/src/TesApi.Web/Storage/TerraStorageAccessProvider.cs @@ -241,11 +241,11 @@ private async Task GetTerraBlobInfoFromContainerNameAsync(string CheckIfAccountIsTerraStorageAccount(segments.AccountName); - Logger.LogInformation($"Getting Workspace ID from the Container Name: {segments.ContainerName}"); + Logger.LogDebug($"Getting Workspace ID from the Container Name: {segments.ContainerName}"); var workspaceId = ToWorkspaceId(segments.ContainerName); - Logger.LogInformation($"Workspace ID to use: {segments.ContainerName}"); + Logger.LogDebug($"Workspace ID to use: {segments.ContainerName}"); var wsmContainerResourceId = await GetWsmContainerResourceIdAsync(workspaceId, segments.ContainerName, cancellationToken); @@ -254,7 +254,7 @@ private async Task GetTerraBlobInfoFromContainerNameAsync(string private async Task GetWsmContainerResourceIdAsync(Guid workspaceId, string containerName, CancellationToken cancellationToken) { - Logger.LogInformation($"Getting container resource information from WSM. Workspace ID: {workspaceId} Container Name: {containerName}"); + Logger.LogDebug($"Getting container resource information from WSM. Workspace ID: {workspaceId} Container Name: {containerName}"); try { @@ -266,7 +266,7 @@ private async Task GetWsmContainerResourceIdAsync(Guid workspaceId, string r.ResourceAttributes.AzureStorageContainer.StorageContainerName.Equals(containerName, StringComparison.OrdinalIgnoreCase)).Metadata; - Logger.LogInformation($"Found the resource id for storage container resource. Resource ID: {metadata.ResourceId} Container Name: {containerName}"); + Logger.LogDebug($"Found the resource id for storage container resource. 
Resource ID: {metadata.ResourceId} Container Name: {containerName}"); return Guid.Parse(metadata.ResourceId); } @@ -325,7 +325,7 @@ internal async Task GetMappedSasUrlFromWsmAsync(TerraBlobInfo blobInfo, boo { var tokenInfo = await GetWorkspaceBlobSasTokenFromWsmAsync(blobInfo, needsTags, cancellationToken); - Logger.LogInformation($"Successfully obtained the Sas Url from Terra. Wsm resource id:{terraOptions.WorkspaceStorageContainerResourceId}"); + Logger.LogDebug($"Successfully obtained the Sas Url from Terra. Wsm resource id:{terraOptions.WorkspaceStorageContainerResourceId}"); var uriBuilder = new UriBuilder(tokenInfo.Url); @@ -351,7 +351,7 @@ private async Task GetWorkspaceBlobSasTokenFromWsmAsync( { var tokenParams = CreateTokenParamsFromOptions(blobInfo.BlobName, SasBlobPermissions + (needsTags.GetValueOrDefault() ? "t" : string.Empty)); - Logger.LogInformation( + Logger.LogDebug( $"Getting Sas Url from Terra. Wsm workspace id:{blobInfo.WorkspaceId}"); return await terraWsmApiClient.Value.GetSasTokenAsync( @@ -365,7 +365,7 @@ private async Task GetWorkspaceContainerSasTokenFromWsmA // an empty blob name gets a container Sas token var tokenParams = CreateTokenParamsFromOptions(blobName: "", SasContainerPermissions + (needsTags.GetValueOrDefault() ? "t" : string.Empty)); - Logger.LogInformation( + Logger.LogDebug( $"Getting Sas container Url from Terra. 
Wsm workspace id:{blobInfo.WorkspaceId}"); return await terraWsmApiClient.Value.GetSasTokenAsync( diff --git a/src/TesApi.Web/TaskScheduler.cs b/src/TesApi.Web/TaskScheduler.cs index d2bc8658f..14859a3b9 100644 --- a/src/TesApi.Web/TaskScheduler.cs +++ b/src/TesApi.Web/TaskScheduler.cs @@ -92,7 +92,7 @@ protected override async ValueTask ExecuteSetupAsync(CancellationToken cancellat return; } - Logger.LogDebug(@"Querying active tasks"); + Logger.LogTrace(@"Querying active tasks"); foreach (var tesTask in (await Repository.GetItemsAsync( @@ -104,7 +104,7 @@ protected override async ValueTask ExecuteSetupAsync(CancellationToken cancellat { if (TesState.QUEUED.Equals(tesTask.State) && string.IsNullOrWhiteSpace(tesTask.PoolId)) { - Logger.LogDebug(@"Adding queued task from repository"); + Logger.LogTrace(@"Adding queued task from repository"); queuedTesTasks.Enqueue(tesTask); } else @@ -118,7 +118,7 @@ protected override async ValueTask ExecuteSetupAsync(CancellationToken cancellat } else { - Logger.LogDebug(@"Adding task to pool w/o cloudtask"); + Logger.LogTrace(@"Adding task to pool w/o cloudtask"); _ = pool.AssociatedTesTasks.AddOrUpdate(tesTask.Id, key => null, (key, value) => value); } } @@ -129,7 +129,7 @@ protected override async ValueTask ExecuteSetupAsync(CancellationToken cancellat } } - Logger.LogDebug(@"Active tasks processed"); + Logger.LogTrace(@"Active tasks processed"); } /// @@ -159,7 +159,7 @@ protected override async ValueTask ExecuteCoreAsync(CancellationToken cancellati return; } - Logger.LogDebug(@"Task load: {TaskCount}", queuedTasks.Count); + Logger.LogTrace(@"Task load: {TaskCount}", queuedTasks.Count); await Task.WhenAll(queuedTasks); } @@ -375,7 +375,7 @@ async ValueTask GetTesTaskAsync(string id, string @event) TesTask tesTask = default; if (await Repository.TryGetItemAsync(id, token, task => tesTask = task) && tesTask is not null) { - Logger.LogDebug("Completing event '{TaskEvent}' for task {TesTask}.", @event, tesTask.Id); + 
Logger.LogTrace("Completing event '{TaskEvent}' for task {TesTask}.", @event, tesTask.Id); return tesTask; } else diff --git a/src/TesApi.Web/TerraActionIdentityProvider.cs b/src/TesApi.Web/TerraActionIdentityProvider.cs index 3778edbf9..06535c80c 100644 --- a/src/TesApi.Web/TerraActionIdentityProvider.cs +++ b/src/TesApi.Web/TerraActionIdentityProvider.cs @@ -51,12 +51,12 @@ public async Task GetAcrPullActionIdentity(CancellationToken cancellatio if (response is null) { // Corresponds to no identity existing in Sam, or the user not having access to it. - Logger.LogInformation(@"Found no ACR Pull action identity in Sam for {id}", samResourceIdForAcrPull); + Logger.LogDebug(@"Found no ACR Pull action identity in Sam for {id}", samResourceIdForAcrPull); return null; } else { - Logger.LogInformation(@"Successfully fetched ACR action identity from Sam: {ObjectId}", response.ObjectId); + Logger.LogDebug(@"Successfully fetched ACR action identity from Sam: {ObjectId}", response.ObjectId); return response.ObjectId; } } diff --git a/src/TesApi.Web/appsettings.json b/src/TesApi.Web/appsettings.json index 9b79b0c8c..2f1e87b30 100644 --- a/src/TesApi.Web/appsettings.json +++ b/src/TesApi.Web/appsettings.json @@ -7,11 +7,11 @@ "LogLevel": { "Azure": "Warning", "Microsoft": "Warning", - "TesApi.Web.AzureProxy": "Debug", - "TesApi.Web.BatchPool": "Debug", - "TesApi.Web.BatchScheduler": "Debug", - "TesApi.Web.PoolScheduler": "Debug", - "TesApi.Web.TaskScheduler": "Debug", + "TesApi.Web.AzureProxy": "Trace", + "TesApi.Web.BatchPool": "Trace", + "TesApi.Web.BatchScheduler": "Trace", + "TesApi.Web.PoolScheduler": "Trace", + "TesApi.Web.TaskScheduler": "Trace", "Default": "Information" } }, From b9b496e5d8ff1ab7c993706083d6cedba254a8e7 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 1 Nov 2024 17:36:28 -0700 Subject: [PATCH 166/202] Cleanup --- .../PostgreSqlCachingRepository.cs | 19 +++++++++++-------- .../TesTaskPostgreSqlRepository.cs | 1 - src/Tes/Models/TesTask.cs | 2 +- 
3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/Tes.Repository/PostgreSqlCachingRepository.cs b/src/Tes.Repository/PostgreSqlCachingRepository.cs index 42fdc7ac4..de4ab7679 100644 --- a/src/Tes.Repository/PostgreSqlCachingRepository.cs +++ b/src/Tes.Repository/PostgreSqlCachingRepository.cs @@ -155,15 +155,18 @@ protected Task AddUpdateOrRemoveItemInDbAsync(TDbItem item, Func(); var result = source.Task; - if (updatingItems.TryAdd(item.Id, null)) + if (WriteAction.Add != action) { - result = source.Task.ContinueWith(RemoveUpdatingItem).Unwrap(); - } - else - { - throw new RepositoryCollisionException( - "Respository concurrency failure: attempt to update item with previously queued update pending.", - getItem(item)); + if (updatingItems.TryAdd(item.Id, null)) + { + result = source.Task.ContinueWith(RemoveUpdatingItem).Unwrap(); + } + else + { + throw new RepositoryCollisionException( + "Respository concurrency failure: attempt to update item with previously queued update pending.", + getItem(item)); + } } if (!itemsToWrite.Writer.TryWrite(new(item, action, source))) diff --git a/src/Tes.Repository/TesTaskPostgreSqlRepository.cs b/src/Tes.Repository/TesTaskPostgreSqlRepository.cs index c54c1cefe..8dfd1849e 100644 --- a/src/Tes.Repository/TesTaskPostgreSqlRepository.cs +++ b/src/Tes.Repository/TesTaskPostgreSqlRepository.cs @@ -140,7 +140,6 @@ public async Task TryGetItemAsync(string id, CancellationToken cancellatio /// public async Task> GetItemsAsync(Expression> predicate, CancellationToken cancellationToken) { - Logger.LogWarning(@"TesTaskPostgreSqlRepository::GetItemsAsync called"); // TODO: remove this log return (await InternalGetItemsAsync(cancellationToken, efPredicates: [predicate])).Select(t => t.TesTask); } diff --git a/src/Tes/Models/TesTask.cs b/src/Tes/Models/TesTask.cs index 37c46597d..95041ade3 100644 --- a/src/Tes/Models/TesTask.cs +++ b/src/Tes/Models/TesTask.cs @@ -120,7 +120,7 @@ public TesTask() /// Valid TES task ID public 
string CreateId() { - var tesTaskIdPrefix = WorkflowId is not null && Guid.TryParse(WorkflowId, out _) ? $"{WorkflowId[..8]}_" : string.Empty; + var tesTaskIdPrefix = WorkflowId is not null && Guid.TryParse(WorkflowId, out var guid) ? guid.ToString()[..8] + "_" : string.Empty; return $"{tesTaskIdPrefix}{Guid.NewGuid():N}"; } From 3b0baaa521135abeb35a4e42f8f2ff0deaafed9d Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 25 Nov 2024 16:26:05 -0800 Subject: [PATCH 167/202] Extend retry configuration --- src/TesApi.Web/Program.cs | 13 +++++++++++-- src/TesApi.Web/Startup.cs | 6 ++++++ .../Storage/DefaultStorageAccessProvider.cs | 8 ++++---- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/src/TesApi.Web/Program.cs b/src/TesApi.Web/Program.cs index f7fb3d8a5..303035197 100644 --- a/src/TesApi.Web/Program.cs +++ b/src/TesApi.Web/Program.cs @@ -8,6 +8,7 @@ using Azure.ResourceManager; using CommonUtilities; using CommonUtilities.AzureCloud; +using CommonUtilities.Options; using Microsoft.AspNetCore; using Microsoft.AspNetCore.Hosting; using Microsoft.Extensions.Configuration; @@ -160,10 +161,18 @@ static ApplicationInsightsOptions GetApplicationInsightsConnectionString(IConfig static AzureCloudConfig GetAzureCloudConfig(IConfiguration configuration) { - var tesOptions = new GeneralOptions(); + GeneralOptions tesOptions = new(); configuration.Bind(GeneralOptions.SectionName, tesOptions); Console.WriteLine($"tesOptions.AzureCloudName: {tesOptions.AzureCloudName}"); - return AzureCloudConfig.FromKnownCloudNameAsync(cloudName: tesOptions.AzureCloudName, azureCloudMetadataUrlApiVersion: tesOptions.AzureCloudMetadataUrlApiVersion).Result; + + RetryPolicyOptions retryPolicy = new(); + configuration.Bind(RetryPolicyOptions.SectionName, retryPolicy); + + return AzureCloudConfig.FromKnownCloudNameAsync( + cloudName: tesOptions.AzureCloudName, + azureCloudMetadataUrlApiVersion: tesOptions.AzureCloudMetadataUrlApiVersion, + retryPolicyOptions: 
Microsoft.Extensions.Options.Options.Create(retryPolicy)) + .Result; } } } diff --git a/src/TesApi.Web/Startup.cs b/src/TesApi.Web/Startup.cs index 331682c71..8414683a5 100644 --- a/src/TesApi.Web/Startup.cs +++ b/src/TesApi.Web/Startup.cs @@ -303,6 +303,12 @@ static string ParseStorageUri(string uri) .AddHostedService(sp => (AllowedVmSizesService)sp.GetRequiredService(typeof(IAllowedVmSizesService))) .AddHostedService() .AddHostedService(s => s.GetRequiredService()); + + RetryPolicyOptions retryPolicy = new(); + configuration.Bind(RetryPolicyOptions.SectionName, retryPolicy); + + ClientOptions.Default.Retry.MaxRetries = retryPolicy.MaxRetryCount; + ClientOptions.Default.Retry.Delay = TimeSpan.FromSeconds(retryPolicy.ExponentialBackOffExponent); } catch (Exception exc) { diff --git a/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs b/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs index 170a64f8e..9097ce299 100644 --- a/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs +++ b/src/TesApi.Web/Storage/DefaultStorageAccessProvider.cs @@ -51,7 +51,7 @@ public DefaultStorageAccessProvider(ILogger logger } else { - logger.LogError($"Invalid value '{uri}' found in 'ExternalStorageContainers' configuration. Value must be a valid azure storage account or container URL."); + logger.LogError("Invalid value '{Url}' found in 'ExternalStorageContainers' configuration. Value must be a valid azure storage account or container URL.", uri); return null; } }) @@ -183,7 +183,7 @@ private async Task AddSasTokenAsyncImpl(StorageAccoun if (!await TryGetStorageAccountInfoAsync(pathSegments.AccountName, cancellationToken, info => storageAccountInfo = info)) { - Logger.LogError($"Could not find storage account '{pathSegments.AccountName}' corresponding to path '{path}'. Either the account does not exist or the TES app service does not have permission to it."); + Logger.LogError("Could not find storage account '{StorageAccount}' corresponding to path '{StoragePath}'. 
Either the account does not exist or the TES app service does not have permission to it.", pathSegments.AccountName, path); throw new InvalidOperationException($"Could not find storage account '{pathSegments.AccountName}' corresponding to path '{path}'."); } @@ -303,12 +303,12 @@ private async Task TryGetStorageAccountInfoAsync(string accountName, Cance } else { - Logger.LogError($"Could not find storage account '{accountName}'. Either the account does not exist or the TES app service does not have permission to it."); + Logger.LogError("Could not find storage account '{StorageAccountName}'. Either the account does not exist or the TES app service does not have permission to it.", accountName); } } catch (Exception ex) { - Logger.LogError(ex, $"Exception while getting storage account '{accountName}'"); + Logger.LogError(ex, "Exception while getting storage account '{StorageAccountName}'", accountName); } return false; From c1b75b76f91d4c2ea48b63e78e99218059fdc962 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 11 Dec 2024 09:53:15 -0800 Subject: [PATCH 168/202] Implement token retry from connection string --- ...AzureServicesConnectionStringCredential.cs | 147 +++++++++++++----- 1 file changed, 107 insertions(+), 40 deletions(-) diff --git a/src/CommonUtilities/AzureServicesConnectionStringCredential.cs b/src/CommonUtilities/AzureServicesConnectionStringCredential.cs index 8d8115e99..df7644043 100644 --- a/src/CommonUtilities/AzureServicesConnectionStringCredential.cs +++ b/src/CommonUtilities/AzureServicesConnectionStringCredential.cs @@ -113,6 +113,11 @@ private void SetInitialState(AzureCloudConfig armEndpoints) /// public bool DisableInstanceDiscovery { get; set; } + /// + /// Options controlling the storage of the token cache. + /// + public Azure.Identity.TokenCachePersistenceOptions TokenCachePersistenceOptions { get; set; } + /// /// Specifies tenants in addition to the specified for which the credential may acquire tokens. 
/// Add the wildcard value "*" to allow the credential to acquire tokens for any tenant the logged in account can access. @@ -134,23 +139,17 @@ private void SetInitialState(AzureCloudConfig armEndpoints) internal Azure.Identity.AzureCliCredential CreateAzureCliCredential() { - var result = new Azure.Identity.AzureCliCredentialOptions { TenantId = TenantId, AuthorityHost = AuthorityHost, IsUnsafeSupportLoggingEnabled = IsUnsafeSupportLoggingEnabled }; - CopyAdditionallyAllowedTenants(result.AdditionallyAllowedTenants); - return new(result); + return new(ConfigureOptions(new Azure.Identity.AzureCliCredentialOptions())); } internal Azure.Identity.VisualStudioCredential CreateVisualStudioCredential() { - var result = new Azure.Identity.VisualStudioCredentialOptions { TenantId = TenantId, AuthorityHost = AuthorityHost, IsUnsafeSupportLoggingEnabled = IsUnsafeSupportLoggingEnabled }; - CopyAdditionallyAllowedTenants(result.AdditionallyAllowedTenants); - return new(result); + return new(ConfigureOptions(new Azure.Identity.VisualStudioCredentialOptions())); } internal Azure.Identity.VisualStudioCodeCredential CreateVisualStudioCodeCredential() { - var result = new Azure.Identity.VisualStudioCodeCredentialOptions { TenantId = TenantId, AuthorityHost = AuthorityHost, IsUnsafeSupportLoggingEnabled = IsUnsafeSupportLoggingEnabled }; - CopyAdditionallyAllowedTenants(result.AdditionallyAllowedTenants); - return new(result); + return new(ConfigureOptions(new Azure.Identity.VisualStudioCodeCredentialOptions())); } //internal Azure.Identity.InteractiveBrowserCredential CreateInteractiveBrowserCredential() @@ -169,40 +168,112 @@ internal Azure.Identity.VisualStudioCodeCredential CreateVisualStudioCodeCredent internal Azure.Identity.ClientSecretCredential CreateClientSecretCredential(string appId, string appKey, string tenantId) { - var result = new Azure.Identity.ClientSecretCredentialOptions { AuthorityHost = AuthorityHost, IsUnsafeSupportLoggingEnabled = 
IsUnsafeSupportLoggingEnabled, DisableInstanceDiscovery = DisableInstanceDiscovery }; - CopyAdditionallyAllowedTenants(result.AdditionallyAllowedTenants); - return new(string.IsNullOrEmpty(tenantId) ? TenantId : tenantId, appId, appKey, result); + return new(string.IsNullOrEmpty(tenantId) ? TenantId : tenantId, appId, appKey, ConfigureOptions(new Azure.Identity.ClientSecretCredentialOptions())); } - internal Azure.Identity.ManagedIdentityCredential CreateManagedIdentityCredential(int _1, string appId) + internal Azure.Identity.ManagedIdentityCredential CreateManagedIdentityCredential(string appId) { - return new(appId, this); + return new(appId, options: this); } - internal Azure.Identity.ManagedIdentityCredential CreateManagedIdentityCredential(int _1) + internal Azure.Identity.ManagedIdentityCredential CreateManagedIdentityCredential() { - return new(options: this); + return CreateManagedIdentityCredential(null!); } internal Azure.Identity.WorkloadIdentityCredential CreateWorkloadIdentityCredential(string appId) { - Azure.Identity.WorkloadIdentityCredentialOptions result = new() { ClientId = appId, AuthorityHost = AuthorityHost, IsUnsafeSupportLoggingEnabled = IsUnsafeSupportLoggingEnabled, DisableInstanceDiscovery = DisableInstanceDiscovery, TenantId = TenantId }; - CopyAdditionallyAllowedTenants(result.AdditionallyAllowedTenants); - return new(result); + return new(ConfigureOptions(new Azure.Identity.WorkloadIdentityCredentialOptions())); } internal Azure.Identity.WorkloadIdentityCredential CreateWorkloadIdentityCredential() { - Azure.Identity.WorkloadIdentityCredentialOptions result = new() { AuthorityHost = AuthorityHost, IsUnsafeSupportLoggingEnabled = IsUnsafeSupportLoggingEnabled, DisableInstanceDiscovery = DisableInstanceDiscovery, TenantId = TenantId }; - CopyAdditionallyAllowedTenants(result.AdditionallyAllowedTenants); - return new(result); + return new(ConfigureOptions(new Azure.Identity.WorkloadIdentityCredentialOptions())); } - void 
CopyAdditionallyAllowedTenants(IList additionalTenants) + // Based on https://github.com/Azure/azure-sdk-for-net/blob/main/sdk/identity/Azure.Identity/src/Credentials/TokenCredentialOptions.cs#L50 method Clone + private T ConfigureOptions(T options) where T : Azure.Identity.TokenCredentialOptions { - foreach (var tenant in AdditionallyAllowedTenants) + CopyTenantId(options); + + // copy TokenCredentialOptions Properties + options.AuthorityHost = AuthorityHost; + + options.IsUnsafeSupportLoggingEnabled = IsUnsafeSupportLoggingEnabled; + + // copy TokenCredentialDiagnosticsOptions specific options + options.Diagnostics.IsAccountIdentifierLoggingEnabled = Diagnostics.IsAccountIdentifierLoggingEnabled; + + // copy ISupportsDisableInstanceDiscovery + CopyDisableInstanceDiscovery(options); + + // copy ISupportsTokenCachePersistenceOptions + CopyTokenCachePersistenceOptions(options); + + // copy ISupportsAdditionallyAllowedTenants + CopyAdditionallyAllowedTenants(options); + + // copy base ClientOptions properties + + // only copy transport if the original has changed from the default so as not to set IsCustomTransportSet unintentionally + if (Transport != Default.Transport) { - additionalTenants.Add(tenant); + options.Transport = Transport; + } + + // clone base Diagnostic options + options.Diagnostics.ApplicationId = Diagnostics.ApplicationId; + options.Diagnostics.IsLoggingEnabled = Diagnostics.IsLoggingEnabled; + options.Diagnostics.IsTelemetryEnabled = Diagnostics.IsTelemetryEnabled; + options.Diagnostics.LoggedContentSizeLimit = Diagnostics.LoggedContentSizeLimit; + options.Diagnostics.IsDistributedTracingEnabled = Diagnostics.IsDistributedTracingEnabled; + options.Diagnostics.IsLoggingContentEnabled = Diagnostics.IsLoggingContentEnabled; + + CopyListItems(Diagnostics.LoggedHeaderNames, options.Diagnostics.LoggedHeaderNames); + CopyListItems(Diagnostics.LoggedQueryParameters, options.Diagnostics.LoggedQueryParameters); + + // clone base RetryOptions + 
options.RetryPolicy = RetryPolicy; + + options.Retry.MaxRetries = Retry.MaxRetries; + options.Retry.Delay = Retry.Delay; + options.Retry.MaxDelay = Retry.MaxDelay; + options.Retry.Mode = Retry.Mode; + options.Retry.NetworkTimeout = Retry.NetworkTimeout; + + return options; + } + + private static void CopyListItems(IList source, IList destination) + { + foreach (var item in source) + { + destination.Add(item); + } + } + + private void CopyTenantId(T options) where T : Azure.Identity.TokenCredentialOptions + { + options?.GetType().GetProperty(nameof(TenantId))?.SetValue(options, TenantId); + } + + private void CopyDisableInstanceDiscovery(T options) where T : Azure.Identity.TokenCredentialOptions + { + options?.GetType().GetProperty(nameof(DisableInstanceDiscovery))?.SetValue(options, DisableInstanceDiscovery); + } + + private void CopyTokenCachePersistenceOptions(T options) where T : Azure.Identity.TokenCredentialOptions + { + options?.GetType().GetProperty(nameof(TokenCachePersistenceOptions))?.SetValue(options, TokenCachePersistenceOptions); + } + + void CopyAdditionallyAllowedTenants(T options) where T : Azure.Identity.TokenCredentialOptions + { + var additionalTenants = options?.GetType().GetProperty(nameof(AdditionallyAllowedTenants))?.GetValue(options) as IList; + + if (additionalTenants is not null) + { + CopyListItems(AdditionallyAllowedTenants, additionalTenants); } } } @@ -367,25 +438,18 @@ internal static TokenCredential Create(AzureServicesConnectionStringCredentialOp } else { - ValidateMsiRetryTimeout(connectionSettings, options.ConnectionString); + ValidateAndSetMsiRetryTimeout(connectionSettings, options); // If certificate or client secret are not specified, use the specified managed identity - azureServiceTokenCredential = options.CreateManagedIdentityCredential( - connectionSettings.TryGetValue(MsiRetryTimeout, out var value) - ? 
int.Parse(value) - : 0, - appId); + azureServiceTokenCredential = options.CreateManagedIdentityCredential(appId); } } else { - ValidateMsiRetryTimeout(connectionSettings, options.ConnectionString); + ValidateAndSetMsiRetryTimeout(connectionSettings, options); // If AppId is not specified, use Managed Service Identity - azureServiceTokenCredential = options.CreateManagedIdentityCredential( - connectionSettings.TryGetValue(MsiRetryTimeout, out var value) - ? int.Parse(value) - : 0); + azureServiceTokenCredential = options.CreateManagedIdentityCredential(); } } else if (string.Equals(runAs, Workload, StringComparison.OrdinalIgnoreCase)) @@ -468,7 +532,7 @@ private static void ValidateAttribute(Dictionary connectionSetti // } //} - private static void ValidateMsiRetryTimeout(Dictionary connectionSettings, string connectionString) + private static void ValidateAndSetMsiRetryTimeout(Dictionary connectionSettings, AzureServicesConnectionStringCredentialOptions options) { if (connectionSettings != null && connectionSettings.TryGetValue(MsiRetryTimeout, out var value)) { @@ -476,10 +540,13 @@ private static void ValidateMsiRetryTimeout(Dictionary connectio { var timeoutString = value; - var parseSucceeded = int.TryParse(timeoutString, out _); - if (!parseSucceeded) + if (int.TryParse(timeoutString, out var timeoutValue) && timeoutValue >= 0) + { + options.Retry.NetworkTimeout = TimeSpan.FromSeconds(timeoutValue); + } + else { - throw new ArgumentException($"Connection string '{connectionString}' is not valid. MsiRetryTimeout '{timeoutString}' is not valid. Valid values are integers greater than or equal to 0.", nameof(connectionString)); + throw new ArgumentException($"Connection string '{options.ConnectionString}' is not valid. MsiRetryTimeout '{timeoutString}' is not valid. 
Valid values are integers greater than or equal to 0.", nameof(options)); } } } From 6e3b5fd7e833a0e8856a7d7560eb74b961a2a79e Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 11 Dec 2024 10:21:27 -0800 Subject: [PATCH 169/202] Address flaky unit tests --- .../ArmEnvironmentEndpointsTests.cs | 27 ++++++++++++++----- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/src/CommonUtilities.Tests/ArmEnvironmentEndpointsTests.cs b/src/CommonUtilities.Tests/ArmEnvironmentEndpointsTests.cs index 85033b6c7..2b054a40d 100644 --- a/src/CommonUtilities.Tests/ArmEnvironmentEndpointsTests.cs +++ b/src/CommonUtilities.Tests/ArmEnvironmentEndpointsTests.cs @@ -147,22 +147,17 @@ private static bool Equals(IReadOnlyDictionary x, T y) [DataRow("AzureChinaCloud", "https://management.chinacloudapi.cn/.default", DisplayName = "AzureChinaCloud")] public async Task FromKnownCloudNameAsync_ExpectedDefaultTokenScope(string cloud, string audience) { - var environment = await AzureCloudConfig.FromKnownCloudNameAsync(cloudName: cloud, retryPolicyOptions: Microsoft.Extensions.Options.Options.Create(new Options.RetryPolicyOptions())); + var environment = await SkipWhenTimeout(AzureCloudConfig.FromKnownCloudNameAsync(cloudName: cloud, retryPolicyOptions: Microsoft.Extensions.Options.Options.Create(new Options.RetryPolicyOptions()))); Assert.AreEqual(audience, GetPropertyFromEnvironment(environment, nameof(AzureCloudConfig.DefaultTokenScope))); } - private static T? 
GetPropertyFromEnvironment(AzureCloudConfig environment, string property) - { - return (T?)environment.GetType().GetProperty(property)?.GetValue(environment); - } - [DataTestMethod] [DataRow(Cloud.Public, "AzureCloud", DisplayName = "All generally available global Azure regions")] [DataRow(Cloud.USGovernment, "AzureUSGovernment", DisplayName = "Azure Government")] [DataRow(Cloud.China, "AzureChinaCloud", DisplayName = "Microsoft Azure operated by 21Vianet")] public async Task FromKnownCloudNameAsync_ExpectedValues(Cloud cloud, string cloudName) { - var environment = await AzureCloudConfig.FromKnownCloudNameAsync(cloudName: cloudName, retryPolicyOptions: Microsoft.Extensions.Options.Options.Create(new Options.RetryPolicyOptions())); + var environment = await SkipWhenTimeout(AzureCloudConfig.FromKnownCloudNameAsync(cloudName: cloudName, retryPolicyOptions: Microsoft.Extensions.Options.Options.Create(new Options.RetryPolicyOptions()))); foreach (var (property, value) in CloudEndpoints[cloud]) { switch (value) @@ -189,5 +184,23 @@ public async Task FromKnownCloudNameAsync_ExpectedValues(Cloud cloud, string clo } } } + + private static T? 
GetPropertyFromEnvironment(AzureCloudConfig environment, string property) + { + return (T?)environment.GetType().GetProperty(property)?.GetValue(environment); + } + + private static async Task SkipWhenTimeout(Task task) + { + try + { + return await task; + } + catch (TaskCanceledException e) when (e.InnerException is TimeoutException) + { + Assert.Inconclusive(e.Message); + throw new System.Diagnostics.UnreachableException(); + } + } } } From 9c752950586b1c42d20f719442101b0b6abf7b08 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 11 Dec 2024 10:32:14 -0800 Subject: [PATCH 170/202] Fix metrics.txt path --- src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs b/src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs index 6b8ca0a76..97b45bcd8 100644 --- a/src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs +++ b/src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs @@ -183,8 +183,8 @@ public virtual async Task ToNodeTaskAsync(TesTask task, NodeTaskConver private void AddTaskOutputs(TesTask task, NodeTaskBuilder builder) { - foreach (var (path, url) in new List(["stderr.txt", "stdout.txt", MetricsFileName]) - .Select(file => (Path: $"/{file}", Url: storageAccessProvider.GetInternalTesTaskBlobUrlWithoutSasToken(task, file)))) + foreach (var (path, url) in new List(["stderr.txt", "stdout.txt", $"wd/{MetricsFileName}"]) + .Select(file => (Path: $"/{file}", Url: storageAccessProvider.GetInternalTesTaskBlobUrlWithoutSasToken(task, System.IO.Path.GetFileName(file))))) { builder.WithOutputUsingCombinedTransformationStrategy( AppendParentDirectoryIfSet(path, $"%{NodeTaskBuilder.BatchTaskDirEnvVarName}%"), From e92f63666ed2a08c2d2eb7c60f70579ab6063be5 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 12 Dec 2024 11:25:11 -0800 Subject: [PATCH 171/202] Address flaky unit tests --- src/TesApi.Tests/BatchSchedulerTests.cs | 5 +---- src/TesApi.Web/TaskScheduler.cs | 2 
+- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index 4c6b7f5ef..dd206645a 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -1336,9 +1336,6 @@ public async Task PoolIsCreatedWithoutPublicIpWhenSubnetAndDisableBatchNodesPubl return (tesTask.Logs?.LastOrDefault()?.FailureReason, tesTask.Logs?.LastOrDefault()?.SystemLogs?.ToArray()); } - private static Task<(string JobId, IEnumerable CloudTask, BatchAccountPoolData batchModelsPool)> ProcessTesTaskAndGetBatchJobArgumentsAsync() - => ProcessTesTaskAndGetBatchJobArgumentsAsync(GetTesTask(), GetMockConfig()(), GetMockAzureProxy(AzureProxyReturnValues.Defaults), GetMockBatchPoolManager(AzureProxyReturnValues.Defaults), AzureProxyReturnValues.Defaults); - private static Task<(string JobId, IEnumerable CloudTask, BatchAccountPoolData batchModelsPool)> ProcessTesTaskAndGetBatchJobArgumentsAsync(TesTask tesTask, IEnumerable<(string Key, string Value)> configuration, Action> azureProxy, Action> batchPoolManager, AzureProxyReturnValues azureProxyReturnValues, Action additionalActions = default, Action> serviceProviderActions = default) => ProcessTesTasksAndGetBatchJobArgumentsAsync([tesTask], configuration, azureProxy, batchPoolManager, azureProxyReturnValues, additionalActions, serviceProviderActions); @@ -1645,7 +1642,7 @@ public PerformBatchSchedulerBackgroundTasks(IBatchScheduler batchScheduler) : th private readonly async Task RepeatedlyCallPerformBackgroundTasksAsync() { - using PeriodicTimer timer = new(TimeSpan.FromMilliseconds(50)); + using PeriodicTimer timer = new(TimeSpan.FromMilliseconds(750)); while (!cancellationToken.IsCancellationRequested) { diff --git a/src/TesApi.Web/TaskScheduler.cs b/src/TesApi.Web/TaskScheduler.cs index 14859a3b9..af2d3c58e 100644 --- a/src/TesApi.Web/TaskScheduler.cs +++ b/src/TesApi.Web/TaskScheduler.cs @@ -58,7 +58,7 @@ internal class 
TaskScheduler(RunnerEventsProcessor nodeEventProcessor, Microsoft private static readonly TimeSpan queuedRunInterval = TimeSpan.FromMilliseconds(100); internal static readonly TimeSpan BatchRunInterval = TimeSpan.FromSeconds(30); // The very fastest process inside of Azure Batch accessing anything within pools or jobs appears to use a 30 second polling interval private static readonly TimeSpan shortBackgroundRunInterval = TimeSpan.FromSeconds(1); - private static readonly TimeSpan longBackgroundRunInterval = TimeSpan.FromSeconds(1); + private static readonly TimeSpan longBackgroundRunInterval = TimeSpan.FromSeconds(2.5); private readonly RunnerEventsProcessor nodeEventProcessor = nodeEventProcessor; /// From f92334edee937c924dd79de6d332e3012e98fe53 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 12 Dec 2024 11:25:48 -0800 Subject: [PATCH 172/202] Apply the specified Workload Identity --- .../AzureServicesConnectionStringCredential.cs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/CommonUtilities/AzureServicesConnectionStringCredential.cs b/src/CommonUtilities/AzureServicesConnectionStringCredential.cs index df7644043..aad87483a 100644 --- a/src/CommonUtilities/AzureServicesConnectionStringCredential.cs +++ b/src/CommonUtilities/AzureServicesConnectionStringCredential.cs @@ -183,7 +183,7 @@ internal Azure.Identity.ManagedIdentityCredential CreateManagedIdentityCredentia internal Azure.Identity.WorkloadIdentityCredential CreateWorkloadIdentityCredential(string appId) { - return new(ConfigureOptions(new Azure.Identity.WorkloadIdentityCredentialOptions())); + return new(ConfigureOptions(new Azure.Identity.WorkloadIdentityCredentialOptions() { ClientId = appId })); } internal Azure.Identity.WorkloadIdentityCredential CreateWorkloadIdentityCredential() @@ -454,7 +454,8 @@ internal static TokenCredential Create(AzureServicesConnectionStringCredentialOp } else if (string.Equals(runAs, Workload, StringComparison.OrdinalIgnoreCase)) { - // 
If RunAs=Workload use the specified Workload Identity + // RunAs=Workload + // Use the specified Workload Identity // If AppId key is present, use it as the ClientId if (connectionSettings.TryGetValue(AppId, out var appId)) { From f178514adc87002cad155c413f233ba0a67d8031 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 12 Dec 2024 15:21:50 -0800 Subject: [PATCH 173/202] Separate node task outputs from task container(s) outputs --- src/CommonUtilities/Models/NodeTask.cs | 1 + src/Tes.Runner/Executor.cs | 43 +++++++++++++++++++ .../Storage/FileOperationResolver.cs | 19 ++++++++ src/Tes.RunnerCLI/Commands/CommandHandlers.cs | 16 ++++--- .../Runner/TaskToNodeTaskConverterTests.cs | 21 ++++++--- src/TesApi.Web/Runner/NodeTaskBuilder.cs | 26 ++++++----- .../Runner/TaskToNodeTaskConverter.cs | 3 +- 7 files changed, 104 insertions(+), 25 deletions(-) diff --git a/src/CommonUtilities/Models/NodeTask.cs b/src/CommonUtilities/Models/NodeTask.cs index 7784078f6..087201be9 100644 --- a/src/CommonUtilities/Models/NodeTask.cs +++ b/src/CommonUtilities/Models/NodeTask.cs @@ -28,6 +28,7 @@ public class NodeTask public Dictionary? ContainerEnv { get; set; } public List? Inputs { get; set; } public List? Outputs { get; set; } + public List? TaskOutputs { get; set; } public string? MetricsFilename { get; set; } public string? InputsMetricsFormat { get; set; } public string? 
OutputsMetricsFormat { get; set; } diff --git a/src/Tes.Runner/Executor.cs b/src/Tes.Runner/Executor.cs index 440df85e2..b740c4367 100644 --- a/src/Tes.Runner/Executor.cs +++ b/src/Tes.Runner/Executor.cs @@ -164,6 +164,36 @@ public async Task UploadOutputsAsync(BlobPipelineOptions blobPipelineOptio } } + public async Task UploadTaskOutputsAsync(BlobPipelineOptions blobPipelineOptions) + { + try + { + ArgumentNullException.ThrowIfNull(blobPipelineOptions, nameof(blobPipelineOptions)); + + var outputs = await CreateUploadTaskOutputsAsync(); + + if (outputs is null) + { + return; + } + + if (outputs.Count == 0) + { + logger.LogWarning("No output files were found."); + return; + } + + var optimizedOptions = OptimizeBlobPipelineOptionsForUpload(blobPipelineOptions, outputs); + + _ = await UploadOutputsAsync(optimizedOptions, outputs); + } + catch (Exception e) + { + logger.LogError(e, "Upload operation failed"); + throw; + } + } + private async Task UploadOutputsAsync(BlobPipelineOptions blobPipelineOptions, List outputs) { var uploader = await transferOperationFactory.CreateBlobUploaderAsync(blobPipelineOptions); @@ -188,6 +218,19 @@ private async Task UploadOutputsAsync(BlobPipelineOptions blobPip return await operationResolver.ResolveOutputsAsync(); } + private async Task?> CreateUploadTaskOutputsAsync() + { + if ((tesNodeTask.Outputs ?? 
[]).Count == 0) + { + logger.LogDebug("No outputs provided"); + { + return default; + } + } + + return await operationResolver.ResolveTaskOutputsAsync(); + } + private BlobPipelineOptions OptimizeBlobPipelineOptionsForUpload(BlobPipelineOptions blobPipelineOptions, List outputs) { var optimizedOptions = diff --git a/src/Tes.Runner/Storage/FileOperationResolver.cs b/src/Tes.Runner/Storage/FileOperationResolver.cs index bff96084c..750c17605 100644 --- a/src/Tes.Runner/Storage/FileOperationResolver.cs +++ b/src/Tes.Runner/Storage/FileOperationResolver.cs @@ -52,6 +52,13 @@ public FileOperationResolver(NodeTask nodeTask, ResolutionPolicyHandler resoluti return await resolutionPolicyHandler.ApplyResolutionPolicyAsync(expandedOutputs); } + public virtual async Task?> ResolveTaskOutputsAsync() + { + var expandedOutputs = ExpandTaskOutputs(); + + return await resolutionPolicyHandler.ApplyResolutionPolicyAsync(expandedOutputs); + } + private List ExpandInputs() { List expandedInputs = []; @@ -104,6 +111,18 @@ private List ExpandOutputs() return outputs; } + private List ExpandTaskOutputs() + { + List outputs = []; + + foreach (var output in nodeTask.Outputs ?? 
[]) + { + outputs.AddRange(ExpandOutput(output)); + } + + return outputs; + } + private IEnumerable ExpandOutput(FileOutput output) { ValidateFileOutput(output); diff --git a/src/Tes.RunnerCLI/Commands/CommandHandlers.cs b/src/Tes.RunnerCLI/Commands/CommandHandlers.cs index d7858d614..63f804f7d 100644 --- a/src/Tes.RunnerCLI/Commands/CommandHandlers.cs +++ b/src/Tes.RunnerCLI/Commands/CommandHandlers.cs @@ -54,11 +54,6 @@ internal static async Task ExecuteRootCommandAsync( await ExecuteAllOperationsAsSubProcessesAsync(nodeTask, file, blockSize, writers, readers, bufferCapacity, apiVersion, dockerUri); - { - await using var executor = await Executor.CreateExecutorAsync(nodeTask, apiVersion); - await executor.AppendMetrics(); - } - await eventsPublisher.PublishTaskCompletionEventAsync(nodeTask, duration.Elapsed, EventsPublisher.SuccessStatus, errorMessage: string.Empty); } @@ -240,6 +235,15 @@ private static async Task ExecuteAllOperationsAsSubProcessesAsync(Runner.Models. await CommandLauncher.LaunchesExecutorCommandAsSubProcessAsync(nodeTask, file, apiVersion, dockerUri); await CommandLauncher.LaunchTransferCommandAsSubProcessAsync(CommandFactory.UploadCommandName, nodeTask, file, options); + + await using var executor = await Executor.CreateExecutorAsync(nodeTask, apiVersion); + await executor.AppendMetrics(); + + _ = await ExecuteTransferTaskAsync(nodeTask, async exec => + { + await exec.UploadTaskOutputsAsync(options); + return 0; + }, apiVersion); } private static async Task ExecuteTransferTaskAsync(Runner.Models.NodeTask nodeTask, Func> transferOperation, string apiVersion) @@ -248,7 +252,7 @@ private static async Task ExecuteTransferTaskAsync(Runner.Models.NodeTask n { await using var executor = await Executor.CreateExecutorAsync(nodeTask, apiVersion); - await transferOperation(executor); + _ = await transferOperation(executor); return (int)ProcessExitCode.Success; } diff --git a/src/TesApi.Tests/Runner/TaskToNodeTaskConverterTests.cs 
b/src/TesApi.Tests/Runner/TaskToNodeTaskConverterTests.cs index 299711e4d..162320c9b 100644 --- a/src/TesApi.Tests/Runner/TaskToNodeTaskConverterTests.cs +++ b/src/TesApi.Tests/Runner/TaskToNodeTaskConverterTests.cs @@ -158,8 +158,9 @@ public async Task var nodeTask = await taskToNodeTaskConverter.ToNodeTaskAsync(tesTask, options, CancellationToken.None); Assert.IsNotNull(nodeTask); Assert.IsNull(nodeTask.Inputs); - Assert.AreEqual(3, nodeTask.Outputs.Count); - Assert.AreEqual(3, nodeTask.Outputs?.Count(output => output.Path!.StartsWith($"%{NodeTaskBuilder.BatchTaskDirEnvVarName}%"))); + Assert.AreEqual(0, nodeTask.Outputs?.Count ?? 0); + Assert.AreEqual(3, nodeTask.TaskOutputs?.Count(output => output.Path!.StartsWith($"%{NodeTaskBuilder.BatchTaskDirEnvVarName}%"))); + Assert.AreEqual(3, nodeTask.TaskOutputs?.Count); } [TestMethod] @@ -176,7 +177,9 @@ public async Task } Assert.AreEqual(5, nodeTask.Outputs?.Count(output => output.Path!.StartsWith(TaskToNodeTaskConverter.BatchTaskWorkingDirEnvVar))); - Assert.AreEqual(3, nodeTask.Outputs?.Count(output => output.Path!.StartsWith($"%{NodeTaskBuilder.BatchTaskDirEnvVarName}%"))); + Assert.AreEqual(3, nodeTask.TaskOutputs?.Count(output => output.Path!.StartsWith($"%{NodeTaskBuilder.BatchTaskDirEnvVarName}%"))); + Assert.AreEqual(5, nodeTask.Outputs?.Count ?? 0); + Assert.AreEqual(3, nodeTask.TaskOutputs?.Count ?? 0); } [TestMethod] @@ -248,7 +251,8 @@ public async Task ToNodeTaskAsync_TesTaskWithNoInputsAndOutputs_NodeTaskContains var nodeTask = await taskToNodeTaskConverter.ToNodeTaskAsync(tesTask, options, CancellationToken.None); Assert.IsNotNull(nodeTask); Assert.IsNull(nodeTask.Inputs); - Assert.AreEqual(0, nodeTask.Outputs?.Count(output => output.Path!.StartsWith(TaskToNodeTaskConverter.BatchTaskWorkingDirEnvVar))); + Assert.AreEqual(0, nodeTask.Outputs?.Count(output => output.Path!.StartsWith(TaskToNodeTaskConverter.BatchTaskWorkingDirEnvVar)) ?? 0); + Assert.AreEqual(0, nodeTask.Outputs?.Count ?? 
0); } [TestMethod] @@ -262,7 +266,8 @@ public async Task Assert.IsNotNull(nodeTask); Assert.AreEqual(2, nodeTask.Inputs!.Count); - Assert.AreEqual(0, nodeTask.Outputs?.Count(output => output.Path!.StartsWith(TaskToNodeTaskConverter.BatchTaskWorkingDirEnvVar))); + Assert.AreEqual(0, nodeTask.Outputs?.Count(output => output.Path!.StartsWith(TaskToNodeTaskConverter.BatchTaskWorkingDirEnvVar)) ?? 0); + Assert.AreEqual(0, nodeTask.Outputs?.Count ?? 0); } [TestMethod] @@ -275,7 +280,8 @@ public async Task var nodeTask = await taskToNodeTaskConverter.ToNodeTaskAsync(tesTask, options, CancellationToken.None); Assert.IsNotNull(nodeTask); Assert.IsNull(nodeTask.Inputs); - Assert.AreEqual(0, nodeTask.Outputs?.Count(output => output.Path!.StartsWith(TaskToNodeTaskConverter.BatchTaskWorkingDirEnvVar))); + Assert.AreEqual(0, nodeTask.Outputs?.Count(output => output.Path!.StartsWith(TaskToNodeTaskConverter.BatchTaskWorkingDirEnvVar)) ?? 0); + Assert.AreEqual(0, nodeTask.Outputs?.Count ?? 0); } [TestMethod] @@ -299,7 +305,8 @@ public async Task var nodeTask = await taskToNodeTaskConverter.ToNodeTaskAsync(tesTask, options, CancellationToken.None); Assert.IsNotNull(nodeTask); Assert.IsNull(nodeTask.Inputs); - Assert.AreEqual(0, nodeTask.Outputs?.Count(output => output.Path!.StartsWith(TaskToNodeTaskConverter.BatchTaskWorkingDirEnvVar))); + Assert.AreEqual(0, nodeTask.Outputs?.Count(output => output.Path!.StartsWith(TaskToNodeTaskConverter.BatchTaskWorkingDirEnvVar)) ?? 0); + Assert.AreEqual(0, nodeTask.Outputs?.Count ?? 0); } [TestMethod] diff --git a/src/TesApi.Web/Runner/NodeTaskBuilder.cs b/src/TesApi.Web/Runner/NodeTaskBuilder.cs index 9d614dcdd..273ca42f7 100644 --- a/src/TesApi.Web/Runner/NodeTaskBuilder.cs +++ b/src/TesApi.Web/Runner/NodeTaskBuilder.cs @@ -149,22 +149,26 @@ public NodeTaskBuilder WithInputUsingCombinedTransformationStrategy(string path, /// /// /// + /// Host task output if True, container task output if False. 
/// public NodeTaskBuilder WithOutputUsingCombinedTransformationStrategy(string path, string targetUrl, - FileType? fileType) + FileType? fileType, bool taskOutputs = false) { ArgumentException.ThrowIfNullOrEmpty(path, nameof(path)); ArgumentException.ThrowIfNullOrEmpty(targetUrl, nameof(targetUrl)); - nodeTask.Outputs ??= []; - nodeTask.Outputs.Add( - new FileOutput() - { - Path = path, - TargetUrl = targetUrl, - TransformationStrategy = GetCombinedTransformationStrategyFromRuntimeOptions(), - FileType = fileType ?? FileType.File - } - ); + + var outputs = taskOutputs + ? nodeTask.TaskOutputs ??= [] + : nodeTask.Outputs ??= []; + + outputs.Add(new FileOutput() + { + Path = path, + TargetUrl = targetUrl, + TransformationStrategy = GetCombinedTransformationStrategyFromRuntimeOptions(), + FileType = fileType ?? FileType.File + }); + return this; } diff --git a/src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs b/src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs index 97b45bcd8..921ee64b2 100644 --- a/src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs +++ b/src/TesApi.Web/Runner/TaskToNodeTaskConverter.cs @@ -189,7 +189,8 @@ private void AddTaskOutputs(TesTask task, NodeTaskBuilder builder) builder.WithOutputUsingCombinedTransformationStrategy( AppendParentDirectoryIfSet(path, $"%{NodeTaskBuilder.BatchTaskDirEnvVarName}%"), url.AbsoluteUri, - fileType: FileType.File); + fileType: FileType.File, + taskOutputs: true); } } From 6c79c8f17ea48c1544c99badf5c4d2ed9840cd6a Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 18 Dec 2024 19:14:39 -0800 Subject: [PATCH 174/202] various small fixes --- src/Tes.RunnerCLI/Commands/CommandHandlers.cs | 16 +-- src/TesApi.Web/BatchPool.cs | 12 +- src/TesApi.Web/BatchPoolService.cs | 111 ------------------ src/TesApi.Web/TaskScheduler.cs | 74 +++++++----- 4 files changed, 60 insertions(+), 153 deletions(-) delete mode 100644 src/TesApi.Web/BatchPoolService.cs diff --git a/src/Tes.RunnerCLI/Commands/CommandHandlers.cs 
b/src/Tes.RunnerCLI/Commands/CommandHandlers.cs index 63f804f7d..bbd971a82 100644 --- a/src/Tes.RunnerCLI/Commands/CommandHandlers.cs +++ b/src/Tes.RunnerCLI/Commands/CommandHandlers.cs @@ -236,14 +236,16 @@ private static async Task ExecuteAllOperationsAsSubProcessesAsync(Runner.Models. await CommandLauncher.LaunchTransferCommandAsSubProcessAsync(CommandFactory.UploadCommandName, nodeTask, file, options); - await using var executor = await Executor.CreateExecutorAsync(nodeTask, apiVersion); - await executor.AppendMetrics(); - - _ = await ExecuteTransferTaskAsync(nodeTask, async exec => + try { - await exec.UploadTaskOutputsAsync(options); - return 0; - }, apiVersion); + await using var executor = await Executor.CreateExecutorAsync(nodeTask, apiVersion); + await executor.AppendMetrics(); + await executor.UploadTaskOutputsAsync(options); + } + catch (Exception e) + { + Logger.LogError(e, "Failed to perform transfer. Operation: {TransferOperation}", nameof(Executor.UploadTaskOutputsAsync)); + } } private static async Task ExecuteTransferTaskAsync(Runner.Models.NodeTask nodeTask, Func> transferOperation, string apiVersion) diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index fb795f3df..780ce9f63 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -678,14 +678,12 @@ async ValueTask ParseBatchException(BatchException batchException, Cancell private const string EjectableComputeNodesFilterClause = @"state eq 'starttaskfailed' or state eq 'preempted' or state eq 'unusable'"; private string EjectableComputeNodesSelectClause() - => ScalingMode.AutoScaleDisabled.Equals(_scalingMode) switch + => (ScalingMode.AutoScaleDisabled.Equals(_scalingMode), _taskPreviousComputeNodeIds.Count == 0) switch { - false => _taskPreviousComputeNodeIds.Count == 0 // Not removing compute nodes - ? 
@"id" // Not servicing tasks by compute node - : @"errors,id,state", // Servicing tasks by compute node - true => _taskPreviousComputeNodeIds.Count == 0 // Possibly removing compute nodes - ? @"id,state,startTaskInfo" // Not servicing tasks by compute node - : @"errors,id,state,startTaskInfo", // Servicing tasks by compute node + (false, false) => @"errors,id,state", // Not removing compute nodes. Servicing tasks by compute node + (false, true) => @"id", // Not removing compute nodes. Not servicing tasks by compute node + (true, false) => @"errors,id,state,startTaskInfo", // Possibly removing compute nodes. Servicing tasks by compute node + (true, true) => @"id,state,startTaskInfo", // Possibly removing compute nodes. Not servicing tasks by compute node }; private async ValueTask> GetNodesToRemove() diff --git a/src/TesApi.Web/BatchPoolService.cs b/src/TesApi.Web/BatchPoolService.cs deleted file mode 100644 index d8702315a..000000000 --- a/src/TesApi.Web/BatchPoolService.cs +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. - -using System; -using System.Linq; -using System.Threading; -using System.Threading.Tasks; -using Microsoft.Extensions.Hosting; -using Microsoft.Extensions.Logging; - -namespace TesApi.Web -{ - /// - /// A background service that montitors CloudPools in the batch system, orchestrates their lifecycle, and updates their state. - /// This should only be used as a system-wide singleton service. This class does not support scale-out on multiple machines, - /// nor does it implement a leasing mechanism. In the future, consider using the Lease Blob operation. - /// - public class BatchPoolService : BackgroundService - { - private readonly IBatchScheduler _batchScheduler; - private readonly ILogger _logger; - - /// - /// Interval between each call to . 
- /// - public static readonly TimeSpan RunInterval = TimeSpan.FromSeconds(30); - - /// - /// Default constructor - /// - /// - /// - /// - public BatchPoolService(IBatchScheduler batchScheduler, ILogger logger) - { - _batchScheduler = batchScheduler ?? throw new ArgumentNullException(nameof(batchScheduler)); - _logger = logger ?? throw new ArgumentNullException(nameof(logger)); - } - - /// - public override Task StartAsync(CancellationToken cancellationToken) - { - _logger.LogInformation("Batch Pools starting..."); - return base.StartAsync(cancellationToken); - } - - /// - public override Task StopAsync(CancellationToken cancellationToken) - { - _logger.LogInformation("Batch Pools stopping..."); - return base.StopAsync(cancellationToken); - } - - /// - protected override async Task ExecuteAsync(CancellationToken stoppingToken) - { - _logger.LogInformation("Batch Pools started."); - _batchScheduler.LoadExistingPoolsAsync(stoppingToken).Wait(stoppingToken); // Delay starting Scheduler until this completes to finish initializing BatchScheduler. - - while (!stoppingToken.IsCancellationRequested) - { - try - { - await ServiceBatchPools(stoppingToken); - await Task.Delay(RunInterval, stoppingToken); - } - catch (OperationCanceledException) when (stoppingToken.IsCancellationRequested) - { - break; - } - catch (Exception exc) - { - _logger.LogError(exc, @"{ExceptionMessage}", exc.Message); - } - } - - _logger.LogInformation("Batch Pools gracefully stopped."); - } - - /// - /// Retrieves all batch pools from the database and affords an opportunity to react to changes. - /// - /// A System.Threading.CancellationToken for controlling the lifetime of the asynchronous operation. 
- /// - private async ValueTask ServiceBatchPools(CancellationToken cancellationToken) - { - var pools = _batchScheduler.GetPools().ToList(); - - if (0 == pools.Count) - { - return; - } - - var startTime = DateTime.UtcNow; - - foreach (var pool in pools) - { - try - { - await pool.ServicePoolAsync(cancellationToken); - } - catch (Exception exc) - { - _logger.LogError(exc, "Batch pool {PoolId} threw an exception in ServiceBatchPools.", pool.PoolId); - } - } - - _logger.LogDebug(@"ServiceBatchPools for {PoolsCount} pools completed in {TotalSeconds} seconds.", pools.Count, DateTime.UtcNow.Subtract(startTime).TotalSeconds); - } - } -} diff --git a/src/TesApi.Web/TaskScheduler.cs b/src/TesApi.Web/TaskScheduler.cs index af2d3c58e..631de7ec9 100644 --- a/src/TesApi.Web/TaskScheduler.cs +++ b/src/TesApi.Web/TaskScheduler.cs @@ -56,7 +56,7 @@ internal class TaskScheduler(RunnerEventsProcessor nodeEventProcessor, Microsoft { private static readonly TimeSpan blobRunInterval = TimeSpan.FromSeconds(15); private static readonly TimeSpan queuedRunInterval = TimeSpan.FromMilliseconds(100); - internal static readonly TimeSpan BatchRunInterval = TimeSpan.FromSeconds(30); // The very fastest process inside of Azure Batch accessing anything within pools or jobs appears to use a 30 second polling interval + internal static readonly TimeSpan BatchRunInterval = TimeSpan.FromSeconds(30); // The very fastest processes inside of Azure Batch accessing anything within pools or jobs appears to use a 30 second polling interval private static readonly TimeSpan shortBackgroundRunInterval = TimeSpan.FromSeconds(1); private static readonly TimeSpan longBackgroundRunInterval = TimeSpan.FromSeconds(2.5); private readonly RunnerEventsProcessor nodeEventProcessor = nodeEventProcessor; @@ -315,24 +315,25 @@ await ParseAvailableEvents(token), /// s and s from all events. 
private async ValueTask MarkProcessedAsync)>> ParseAvailableEvents(CancellationToken cancellationToken) { + var tasks = new ConcurrentDictionary>(StringComparer.OrdinalIgnoreCase); // TODO: Are tesTask.Ids case sensitive? var messages = new ConcurrentBag<(RunnerEventsMessage Message, TesTask Task, AzureBatchTaskState State, Func MarkProcessedAsync)>(); - // Get and parse event blobs + // Get tasks for event blobs await Parallel.ForEachAsync(BatchScheduler.GetEventMessagesAsync(cancellationToken), cancellationToken, async (eventMessage, token) => { - var tesTask = await GetTesTaskAsync(eventMessage.Tags["task-id"], eventMessage.Tags["event-name"]); - - if (tesTask is null) - { - return; - } + TesTask tesTask = default; try { + tesTask = await GetTesTaskAsync(eventMessage.Tags["task-id"], eventMessage.Tags["event-name"]); + + if (tesTask is null) + { + return; + } + nodeEventProcessor.ValidateMessageMetadata(eventMessage); - eventMessage = await nodeEventProcessor.DownloadAndValidateMessageContentAsync(eventMessage, token); - var state = await nodeEventProcessor.GetMessageBatchStateAsync(eventMessage, tesTask, token); - messages.Add((eventMessage, tesTask, state, ct => nodeEventProcessor.MarkMessageProcessedAsync(eventMessage, ct))); + tasks.AddOrUpdate(tesTask.Id, _ => [(eventMessage, tesTask)], (_, list) => list.Add((eventMessage, tesTask))); } catch (OperationCanceledException) when (token.IsCancellationRequested) { @@ -352,22 +353,6 @@ await Parallel.ForEachAsync(BatchScheduler.GetEventMessagesAsync(cancellationTok ]), ct => nodeEventProcessor.RemoveMessageFromReattemptsAsync(eventMessage, ct))); } - catch (Exception ex) - { - Logger.LogError(ex, @"Downloading and parsing event failed: {ErrorMessage}", ex.Message); - - messages.Add(( - eventMessage, - tesTask, - new(AzureBatchTaskState.TaskState.InfoUpdate, Warning: - [ - "EventParsingFailed", - $"{ex.GetType().FullName}: {ex.Message}" - ]), - (ex is System.Diagnostics.UnreachableException || ex is 
RunnerEventsProcessor.DownloadOrParseException) - ? ct => nodeEventProcessor.MarkMessageProcessedAsync(eventMessage, ct) // Mark event processed to prevent retries - : default)); // Retry this event. - } // Helpers async ValueTask GetTesTaskAsync(string id, string @event) @@ -375,7 +360,7 @@ async ValueTask GetTesTaskAsync(string id, string @event) TesTask tesTask = default; if (await Repository.TryGetItemAsync(id, token, task => tesTask = task) && tesTask is not null) { - Logger.LogTrace("Completing event '{TaskEvent}' for task {TesTask}.", @event, tesTask.Id); + Logger.LogTrace("Attempting to complete event '{TaskEvent}' for task {TesTask}.", @event, tesTask.Id); return tesTask; } else @@ -386,6 +371,39 @@ async ValueTask GetTesTaskAsync(string id, string @event) } }); + // Parse event blobs, deferring later events for the same TesTask + await Parallel.ForEachAsync(tasks.Select(pair => nodeEventProcessor.OrderProcessedByExecutorSequence(pair.Value, m => m.Event).First()), cancellationToken, async (tuple, token) => + { + var (eventMessage, tesTask) = tuple; + + try + { + eventMessage = await nodeEventProcessor.DownloadAndValidateMessageContentAsync(eventMessage, token); + var state = await nodeEventProcessor.GetMessageBatchStateAsync(eventMessage, tesTask, token); + messages.Add((eventMessage, tesTask, state, ct => nodeEventProcessor.MarkMessageProcessedAsync(eventMessage, ct))); + } + catch (OperationCanceledException) when (token.IsCancellationRequested) + { + throw; + } + catch (Exception ex) + { + Logger.LogError(ex, @"Downloading and parsing event failed: {ErrorMessage}", ex.Message); + + messages.Add(( + eventMessage, + tesTask, + new(AzureBatchTaskState.TaskState.InfoUpdate, Warning: + [ + "EventParsingFailed", + $"{ex.GetType().FullName}: {ex.Message}" + ]), + (ex is System.Diagnostics.UnreachableException || ex is RunnerEventsProcessor.DownloadOrParseException || ex is ArgumentException) + ? 
ct => nodeEventProcessor.MarkMessageProcessedAsync(eventMessage, ct) // Mark event processed to prevent retries + : default)); // Retry this event. + } + }); + return nodeEventProcessor.OrderProcessedByExecutorSequence(messages, @event => @event.Message).Select(@event => (@event.Task, @event.State, @event.MarkProcessedAsync)); } From 7d26ab2459a452440645b655ea8fdf689c3c758a Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 19 Dec 2024 10:57:58 -0800 Subject: [PATCH 175/202] Actually upload task files --- .../Storage/FileOperationResolver.cs | 2 +- src/Tes.RunnerCLI/Commands/CommandHandlers.cs | 27 +++++++++++-------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/src/Tes.Runner/Storage/FileOperationResolver.cs b/src/Tes.Runner/Storage/FileOperationResolver.cs index 750c17605..937ade59f 100644 --- a/src/Tes.Runner/Storage/FileOperationResolver.cs +++ b/src/Tes.Runner/Storage/FileOperationResolver.cs @@ -115,7 +115,7 @@ private List ExpandTaskOutputs() { List outputs = []; - foreach (var output in nodeTask.Outputs ?? []) + foreach (var output in nodeTask.TaskOutputs ?? []) { outputs.AddRange(ExpandOutput(output)); } diff --git a/src/Tes.RunnerCLI/Commands/CommandHandlers.cs b/src/Tes.RunnerCLI/Commands/CommandHandlers.cs index bbd971a82..a65437487 100644 --- a/src/Tes.RunnerCLI/Commands/CommandHandlers.cs +++ b/src/Tes.RunnerCLI/Commands/CommandHandlers.cs @@ -230,21 +230,26 @@ private static async Task ExecuteAllOperationsAsSubProcessesAsync(Runner.Models. 
BlobPipelineOptionsConverter.ToBlobPipelineOptions(blockSize, writers, readers, bufferCapacity, apiVersion); - await CommandLauncher.LaunchTransferCommandAsSubProcessAsync(CommandFactory.DownloadCommandName, nodeTask, file, options); - - await CommandLauncher.LaunchesExecutorCommandAsSubProcessAsync(nodeTask, file, apiVersion, dockerUri); - - await CommandLauncher.LaunchTransferCommandAsSubProcessAsync(CommandFactory.UploadCommandName, nodeTask, file, options); - try { - await using var executor = await Executor.CreateExecutorAsync(nodeTask, apiVersion); - await executor.AppendMetrics(); - await executor.UploadTaskOutputsAsync(options); + await CommandLauncher.LaunchTransferCommandAsSubProcessAsync(CommandFactory.DownloadCommandName, nodeTask, file, options); + + await CommandLauncher.LaunchesExecutorCommandAsSubProcessAsync(nodeTask, file, apiVersion, dockerUri); + + await CommandLauncher.LaunchTransferCommandAsSubProcessAsync(CommandFactory.UploadCommandName, nodeTask, file, options); } - catch (Exception e) + finally { - Logger.LogError(e, "Failed to perform transfer. Operation: {TransferOperation}", nameof(Executor.UploadTaskOutputsAsync)); + try + { + await using var executor = await Executor.CreateExecutorAsync(nodeTask, apiVersion); + await executor.AppendMetrics(); + await executor.UploadTaskOutputsAsync(options); + } + catch (Exception e) + { + Logger.LogError(e, "Failed to perform transfer. 
Operation: {TransferOperation}", nameof(Executor.UploadTaskOutputsAsync)); + } } } From a2e051e3c7630e85621f4a51c3d0162fe64a47a5 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 20 Dec 2024 17:24:54 -0800 Subject: [PATCH 176/202] Address various regressions --- src/Tes/Models/BatchNodeMetrics.cs | 14 ++-- src/TesApi.Tests/BatchSchedulerTests.cs | 17 ++--- src/TesApi.Web/AzureProxy.cs | 2 +- src/TesApi.Web/BatchPool.cs | 67 +++++++++++-------- src/TesApi.Web/BatchScheduler.cs | 19 +++--- .../CachingWithRetriesAzureProxy.cs | 4 +- src/TesApi.Web/IBatchPool.cs | 5 +- .../CachingWithRetriesBatchPoolManager.cs | 2 +- src/TesApi.Web/PoolScheduler.cs | 3 +- 9 files changed, 69 insertions(+), 64 deletions(-) diff --git a/src/Tes/Models/BatchNodeMetrics.cs b/src/Tes/Models/BatchNodeMetrics.cs index dffec437b..d6778f93c 100644 --- a/src/Tes/Models/BatchNodeMetrics.cs +++ b/src/Tes/Models/BatchNodeMetrics.cs @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. +using System.Collections.Generic; + namespace Tes.Models { /// @@ -8,17 +10,11 @@ namespace Tes.Models /// public class BatchNodeMetrics { - /// - /// BlobXfer docker image pull duration - /// - [TesTaskLogMetadataKey("blobxfer_pull_duration_sec")] - public double? BlobXferImagePullDurationInSeconds { get; set; } - /// /// Executor docker image pull duration /// [TesTaskLogMetadataKey("executor_pull_duration_sec")] - public double? ExecutorImagePullDurationInSeconds { get; set; } + public List ExecutorImagePullDurationInSeconds { get; set; } /// /// File download duration @@ -30,7 +26,7 @@ public class BatchNodeMetrics /// Main command execution duration /// [TesTaskLogMetadataKey("executor_duration_sec")] - public double? 
ExecutorDurationInSeconds { get; set; } + public List ExecutorDurationInSeconds { get; set; } /// /// File upload duration @@ -42,7 +38,7 @@ public class BatchNodeMetrics /// Executor image size in GB /// [TesTaskLogMetadataKey("executor_image_size_gb")] - public double? ExecutorImageSizeInGB { get; set; } + public List ExecutorImageSizeInGB { get; set; } /// /// File download size in GB diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index dd206645a..0ddafe3d9 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -1074,14 +1074,12 @@ public async Task SuccessfullyCompletedTaskContainsBatchNodeMetrics() tesTask.State = TesState.INITIALIZING; var metricsFileContent = @" - BlobXferPullStart=2020-10-08T02:30:39+00:00 - BlobXferPullEnd=2020-10-08T02:31:39+00:00 - ExecutorPullStart=2020-10-08T02:32:39+00:00 + ExecutorPullStart=2020-10-08T02:35:39+00:00 ExecutorImageSizeInBytes=3000000000 - ExecutorPullEnd=2020-10-08T02:34:39+00:00 - DownloadStart=2020-10-08T02:35:39+00:00 + ExecutorPullEnd=2020-10-08T02:37:39+00:00 + DownloadStart=2020-10-08T02:30:39+00:00 FileDownloadSizeInBytes=2000000000 - DownloadEnd=2020-10-08T02:38:39+00:00 + DownloadEnd=2020-10-08T02:33:39+00:00 ExecutorStart=2020-10-08T02:39:39+00:00 ExecutorEnd=2020-10-08T02:43:39+00:00 UploadStart=2020-10-08T02:44:39+00:00 @@ -1105,11 +1103,10 @@ public async Task SuccessfullyCompletedTaskContainsBatchNodeMetrics() var batchNodeMetrics = tesTask.GetOrAddTesTaskLog().BatchNodeMetrics; Assert.IsNotNull(batchNodeMetrics); - Assert.AreEqual(60, batchNodeMetrics.BlobXferImagePullDurationInSeconds); - Assert.AreEqual(120, batchNodeMetrics.ExecutorImagePullDurationInSeconds); - Assert.AreEqual(3, batchNodeMetrics.ExecutorImageSizeInGB); + Assert.AreEqual(120, batchNodeMetrics.ExecutorImagePullDurationInSeconds[0]); + Assert.AreEqual(3, batchNodeMetrics.ExecutorImageSizeInGB[0]); Assert.AreEqual(180, 
batchNodeMetrics.FileDownloadDurationInSeconds); - Assert.AreEqual(240, batchNodeMetrics.ExecutorDurationInSeconds); + Assert.AreEqual(240, batchNodeMetrics.ExecutorDurationInSeconds[0]); Assert.AreEqual(300, batchNodeMetrics.FileUploadDurationInSeconds); Assert.AreEqual(1.024, batchNodeMetrics.DiskUsedInGB); Assert.AreEqual(12.5f, batchNodeMetrics.DiskUsedPercent); diff --git a/src/TesApi.Web/AzureProxy.cs b/src/TesApi.Web/AzureProxy.cs index 4f41efb8d..c32b15104 100644 --- a/src/TesApi.Web/AzureProxy.cs +++ b/src/TesApi.Web/AzureProxy.cs @@ -219,7 +219,7 @@ public async Task TerminateBatchTaskAsync(string tesTaskId, string jobId, Cancel { batchTasksToTerminate = await batchClient.JobOperations.ListTasks(jobId, jobFilter).ToAsyncEnumerable().ToListAsync(cancellationToken); } - catch (BatchException ex) when (ex.InnerException is BatchProtocol.Models.BatchErrorException bee && "JobNotFound".Equals(bee.Body?.Code, StringComparison.InvariantCultureIgnoreCase)) + catch (BatchException ex) when (BatchErrorCodeStrings.JobNotFound.Equals(ex.RequestInformation.BatchError.Code, StringComparison.OrdinalIgnoreCase)) { logger.LogWarning("Job not found for TES task {TesTask}", tesTaskId); return; // Task cannot exist if the job is not found. diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index 780ce9f63..65ce04fd2 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -68,10 +68,7 @@ public BatchPool(IBatchScheduler batchScheduler, IOptions GetTasksAsync(string select, string filter) => _removedFromService ? AsyncEnumerable.Empty() : _azureProxy.ListTasksAsync(PoolId, new ODATADetailLevel { SelectClause = select, FilterClause = filter }); - internal IAsyncEnumerable GetTasksAsync(bool includeCompleted) - => _azureProxy.ListTasksAsync(PoolId, new ODATADetailLevel { SelectClause = "id,stateTransitionTime", FilterClause = includeCompleted ? 
default : "state ne 'completed'" }); - - private async ValueTask RemoveNodesAsync(IList nodesToRemove, CancellationToken cancellationToken) + private async ValueTask RemoveNodesAsync(List nodesToRemove, CancellationToken cancellationToken) { _logger.LogDebug("Removing {Nodes} nodes from {PoolId}", nodesToRemove.Count, PoolId); await _azureProxy.DeleteBatchComputeNodesAsync(PoolId, nodesToRemove, cancellationToken); @@ -339,7 +336,7 @@ private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken canc case ScalingMode.AutoScaleDisabled: { - var nodesToRemove = Enumerable.Empty(); + List nodesToRemove = []; // It's documented that a max of 100 nodes can be removed at a time. Excess eligible nodes will be removed in a future call to this method. await foreach (var node in (await GetNodesToRemove()).Take(MaxComputeNodesToRemoveAtOnce).WithCancellation(cancellationToken)) @@ -352,7 +349,7 @@ private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken canc case ComputeNodeState.StartTaskFailed: _logger.LogTrace("Found starttaskfailed node {NodeId}", node.Id); - StartTaskFailures.Enqueue(new(node.Id, node.StartTaskInformation.FailureInformation)); + StartTaskFailures.Enqueue(new(PoolId, node.Id, node.StartTaskInformation.FailureInformation)); break; case ComputeNodeState.Preempted: @@ -363,25 +360,29 @@ private async ValueTask ServicePoolManagePoolScalingAsync(CancellationToken canc throw new System.Diagnostics.UnreachableException($"Unexpected compute node state '{node.State}' received while looking for nodes to remove from the pool."); } - nodesToRemove = nodesToRemove.Append(node); + nodesToRemove.Add(node); } - nodesToRemove = nodesToRemove.ToList(); - - if (nodesToRemove.Any()) + if (nodesToRemove.Count == 0) + { + goto case ScalingMode.RemovingFailedNodes; + } + else { await nodesToRemove .Where(node => ComputeNodeState.StartTaskFailed.Equals(node.State)) .SelectMany(node => [(node, "stdout.txt"), (node, "stderr.txt")]) - 
.ForEachAsync((logInfo, token) => TransferStartTaskLogAsync(logInfo.Node, logInfo.Log, token), cancellationToken); - await RemoveNodesAsync((IList)nodesToRemove, cancellationToken); + .ForEachAsync(async (nodeAndLog, token) => + { + var file = await nodeAndLog.Node.GetNodeFileAsync($"startup/{nodeAndLog.Log}", cancellationToken: token); + var content = await file.ReadAsStringAsync(cancellationToken: token); + var blobUri = await _storageAccessProvider.GetInternalTesBlobUrlAsync($"/pools/{PoolId}/nodes/{nodeAndLog.Node.Id}/{nodeAndLog.Log}", Azure.Storage.Sas.BlobSasPermissions.Create, token); + await _azureProxy.UploadBlobAsync(blobUri, content, token); + }, cancellationToken); + await RemoveNodesAsync(nodesToRemove, cancellationToken); _resetAutoScalingRequired = false; _scalingMode = ScalingMode.RemovingFailedNodes; } - else - { - goto case ScalingMode.RemovingFailedNodes; - } } break; @@ -406,14 +407,6 @@ await nodesToRemove _logger.LogInformation(@"Pool {PoolId} is back to normal resize and monitoring status.", PoolId); break; } - - async ValueTask TransferStartTaskLogAsync(ComputeNode node, string log, CancellationToken cancellationToken) - { - var file = await node.GetNodeFileAsync($"startup/{log}", cancellationToken: cancellationToken); - var content = await file.ReadAsStringAsync(cancellationToken: cancellationToken); - var blobUri = await _storageAccessProvider.GetInternalTesBlobUrlAsync($"/pools/{PoolId}/nodes/{node.Id}/{log}", Azure.Storage.Sas.BlobSasPermissions.Create, cancellationToken); - await _azureProxy.UploadBlobAsync(blobUri, content, cancellationToken); - } } } @@ -503,12 +496,21 @@ public async ValueTask CanBeDeletedAsync(CancellationToken cancellationTok return true; } - if (await GetTasksAsync("id", default).AnyAsync(cancellationToken)) + try { + if (await GetTasksAsync("id", default).AnyAsync(cancellationToken)) + { + return false; + } + + return true; + } + catch (BatchException ex) when 
(BatchErrorCodeStrings.JobNotFound.Equals(ex.RequestInformation.BatchError.Code)) + { + IsAvailable = false; + //_removedFromService = true; return false; } - - return true; } /// @@ -558,7 +560,16 @@ ValueTask StandupQueries() // List tasks from batch just one time each time we service the pool when called from PoolScheduler _foundTasks.Clear(); - _foundTasks.AddRange(GetTasksAsync("creationTime,executionInfo,id,nodeInfo,state,stateTransitionTime", null).ToBlockingEnumerable(cancellationToken)); + + try + { + _foundTasks.AddRange(GetTasksAsync("creationTime,executionInfo,id,nodeInfo,state,stateTransitionTime", null).ToBlockingEnumerable(cancellationToken)); + } + catch (BatchException ex) when (BatchErrorCodeStrings.JobNotFound.Equals(ex.RequestInformation.BatchError.Code)) + { + IsAvailable = false; + } + _logger.LogTrace("{PoolId}: {TaskCount} tasks discovered.", PoolId, _foundTasks.Count); // List nodes from Batch at most one time each time we service the pool diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 8aa5b0730..7786fa9f5 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -1604,13 +1604,13 @@ private static bool IsThereSufficientCoreQuota(BatchVmCoreQuota coreQuota, Virtu static bool TryGetValueAsDateTimeOffset(Dictionary dict, string key, out DateTimeOffset result) { result = default; - return dict.TryGetValue(key, out var valueAsString) && DateTimeOffset.TryParse(valueAsString, out result); + return dict.TryGetValue(key, out var valueAsString) && DateTimeOffset.TryParse(valueAsString.Trim(), out result); } static bool TryGetValueAsDouble(Dictionary dict, string key, out double result) { result = default; - return dict.TryGetValue(key, out var valueAsString) && double.TryParse(valueAsString, out result); + return dict.TryGetValue(key, out var valueAsString) && double.TryParse(valueAsString.Trim(), out result); } Tes.Models.BatchNodeMetrics batchNodeMetrics = null; @@ -1644,21 
+1644,20 @@ static bool TryGetValueAsDouble(Dictionary dict, string key, out batchNodeMetrics = new() { - BlobXferImagePullDurationInSeconds = GetDurationInSeconds(metrics, "BlobXferPullStart", "BlobXferPullEnd"), - ExecutorImagePullDurationInSeconds = GetDurationInSeconds(metrics, "ExecutorPullStart", "ExecutorPullEnd"), - ExecutorImageSizeInGB = TryGetValueAsDouble(metrics, "ExecutorImageSizeInBytes", out var executorImageSizeInBytes) ? executorImageSizeInBytes / bytesInGB : null, - FileDownloadDurationInSeconds = GetDurationInSeconds(metrics, "DownloadStart", "DownloadEnd"), + /**/ExecutorImagePullDurationInSeconds = [GetDurationInSeconds(metrics, "ExecutorPullStart", "ExecutorPullEnd")], + /**/ExecutorImageSizeInGB = [TryGetValueAsDouble(metrics, "ExecutorImageSizeInBytes", out var executorImageSizeInBytes) ? executorImageSizeInBytes / bytesInGB : null], + /**/FileDownloadDurationInSeconds = GetDurationInSeconds(metrics, "DownloadStart", "DownloadEnd"), FileDownloadSizeInGB = TryGetValueAsDouble(metrics, "FileDownloadSizeInBytes", out var fileDownloadSizeInBytes) ? fileDownloadSizeInBytes / bytesInGB : null, - ExecutorDurationInSeconds = GetDurationInSeconds(metrics, "ExecutorStart", "ExecutorEnd"), - FileUploadDurationInSeconds = GetDurationInSeconds(metrics, "UploadStart", "UploadEnd"), + /**/ExecutorDurationInSeconds = [GetDurationInSeconds(metrics, "ExecutorStart", "ExecutorEnd")], + /**/FileUploadDurationInSeconds = GetDurationInSeconds(metrics, "UploadStart", "UploadEnd"), FileUploadSizeInGB = TryGetValueAsDouble(metrics, "FileUploadSizeInBytes", out var fileUploadSizeInBytes) ? fileUploadSizeInBytes / bytesInGB : null, DiskUsedInGB = diskUsedInGB, DiskUsedPercent = diskUsedInGB.HasValue && diskSizeInGB.HasValue && diskSizeInGB > 0 ? (float?)(diskUsedInGB / diskSizeInGB * 100) : null, VmCpuModelName = metrics.GetValueOrDefault("VmCpuModelName") }; - taskStartTime = TryGetValueAsDateTimeOffset(metrics, "BlobXferPullStart", out var startTime) ? 
startTime : null; - taskEndTime = TryGetValueAsDateTimeOffset(metrics, "UploadEnd", out var endTime) ? endTime : null; + /**/taskStartTime = TryGetValueAsDateTimeOffset(metrics, "DownloadStart", out var startTime) ? startTime : null; + /**/taskEndTime = TryGetValueAsDateTimeOffset(metrics, "UploadEnd", out var endTime) ? endTime : null; } catch (Exception ex) { diff --git a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs index d95713130..c18e66187 100644 --- a/src/TesApi.Web/CachingWithRetriesAzureProxy.cs +++ b/src/TesApi.Web/CachingWithRetriesAzureProxy.cs @@ -78,7 +78,7 @@ async Task IAzureProxy.DeleteBatchTaskAsync(string cloudTaskId, string jobId, Ca { await cachingAsyncRetryExceptWhenNotFound.ExecuteWithRetryAsync(ct => azureProxy.DeleteBatchTaskAsync(cloudTaskId, jobId, ct), cancellationToken); } - catch (BatchException exc) when (BatchErrorCodeStrings.TaskNotFound.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) + catch (BatchException exc) when (BatchErrorCodeStrings.TaskNotFound.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase) || BatchErrorCodeStrings.JobNotFound.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { } } @@ -89,7 +89,7 @@ public async Task TerminateBatchTaskAsync(string tesTaskId, string jobId, Cancel { await cachingAsyncRetryExceptWhenNotFound.ExecuteWithRetryAsync(ct => azureProxy.TerminateBatchTaskAsync(tesTaskId, jobId, ct), cancellationToken); } - catch (BatchException exc) when (BatchErrorCodeStrings.TaskNotFound.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) + catch (BatchException exc) when (BatchErrorCodeStrings.TaskNotFound.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase) || BatchErrorCodeStrings.JobNotFound.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { } } diff 
--git a/src/TesApi.Web/IBatchPool.cs b/src/TesApi.Web/IBatchPool.cs index 6dd65c4b7..5bc21ba0a 100644 --- a/src/TesApi.Web/IBatchPool.cs +++ b/src/TesApi.Web/IBatchPool.cs @@ -105,8 +105,9 @@ public record struct CloudTaskWithPreviousComputeNodeId(CloudTask CloudTask, str /// /// paired with compute node Id. /// - /// Compute node Id + /// . + /// . /// - public record class StartTaskFailureInformation(string NodeId, TaskFailureInformation TaskFailureInformation); + public record class StartTaskFailureInformation(string PoolId, string NodeId, TaskFailureInformation TaskFailureInformation); } } diff --git a/src/TesApi.Web/Management/Batch/CachingWithRetriesBatchPoolManager.cs b/src/TesApi.Web/Management/Batch/CachingWithRetriesBatchPoolManager.cs index 7301894db..6973a1e7e 100644 --- a/src/TesApi.Web/Management/Batch/CachingWithRetriesBatchPoolManager.cs +++ b/src/TesApi.Web/Management/Batch/CachingWithRetriesBatchPoolManager.cs @@ -41,7 +41,7 @@ async Task IBatchPoolManager.DeleteBatchPoolAsync(string poolId, CancellationTok { await cachingAsyncRetryExceptWhenNotFound.ExecuteWithRetryAsync(ct => batchPoolManager.DeleteBatchPoolAsync(poolId, ct), cancellationToken); } - catch (BatchException exc) when (BatchErrorCodeStrings.PoolNotFound.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) + catch (BatchException exc) when (exc.RequestInformation.HttpStatusCode == System.Net.HttpStatusCode.NotFound || BatchErrorCodeStrings.PoolNotFound.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) { } } diff --git a/src/TesApi.Web/PoolScheduler.cs b/src/TesApi.Web/PoolScheduler.cs index 4a7030837..9395bb4f5 100644 --- a/src/TesApi.Web/PoolScheduler.cs +++ b/src/TesApi.Web/PoolScheduler.cs @@ -353,7 +353,8 @@ AzureBatchTaskState ConvertFromStartTask(IBatchPool.StartTaskFailureInformation => new(AzureBatchTaskState.TaskState.NodeStartTaskFailed, Failure: new(failure.TaskFailureInformation.Code, Enumerable.Empty() 
.Append($"Start task failed ({failure.TaskFailureInformation.Category}): {failure.TaskFailureInformation.Message}") .Concat(failure.TaskFailureInformation.Details?.Select(FormatNameValuePair) ?? []) - .Append(failure.NodeId))); + .Append($"tes-internal/pools/{failure.PoolId}/nodes/{failure.NodeId}/stderr.txt") + .Append($"tes-internal/pools/{failure.PoolId}/nodes/{failure.NodeId}/stdout.txt"))); ResizeError PopNextResizeError() => pool.ResizeErrors.TryDequeue(out var resizeError) ? resizeError : default; From d200a63d22b9c4bd4b67e473748cc07dd0689750 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 26 Dec 2024 08:09:29 -0800 Subject: [PATCH 177/202] formatting --- src/TesApi.Web/BatchScheduler.cs | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 7786fa9f5..548879c72 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -1644,20 +1644,27 @@ static bool TryGetValueAsDouble(Dictionary dict, string key, out batchNodeMetrics = new() { - /**/ExecutorImagePullDurationInSeconds = [GetDurationInSeconds(metrics, "ExecutorPullStart", "ExecutorPullEnd")], - /**/ExecutorImageSizeInGB = [TryGetValueAsDouble(metrics, "ExecutorImageSizeInBytes", out var executorImageSizeInBytes) ? executorImageSizeInBytes / bytesInGB : null], - /**/FileDownloadDurationInSeconds = GetDurationInSeconds(metrics, "DownloadStart", "DownloadEnd"), + /* TODO: Not currently implemented */ + ExecutorImagePullDurationInSeconds = [GetDurationInSeconds(metrics, "ExecutorPullStart", "ExecutorPullEnd")], + /* TODO: Not currently implemented */ + ExecutorImageSizeInGB = [TryGetValueAsDouble(metrics, "ExecutorImageSizeInBytes", out var executorImageSizeInBytes) ? 
executorImageSizeInBytes / bytesInGB : null], + /* TODO: Not currently implemented */ + FileDownloadDurationInSeconds = GetDurationInSeconds(metrics, "DownloadStart", "DownloadEnd"), FileDownloadSizeInGB = TryGetValueAsDouble(metrics, "FileDownloadSizeInBytes", out var fileDownloadSizeInBytes) ? fileDownloadSizeInBytes / bytesInGB : null, - /**/ExecutorDurationInSeconds = [GetDurationInSeconds(metrics, "ExecutorStart", "ExecutorEnd")], - /**/FileUploadDurationInSeconds = GetDurationInSeconds(metrics, "UploadStart", "UploadEnd"), + /* TODO: Not currently implemented */ + ExecutorDurationInSeconds = [GetDurationInSeconds(metrics, "ExecutorStart", "ExecutorEnd")], + /* TODO: Not currently implemented */ + FileUploadDurationInSeconds = GetDurationInSeconds(metrics, "UploadStart", "UploadEnd"), FileUploadSizeInGB = TryGetValueAsDouble(metrics, "FileUploadSizeInBytes", out var fileUploadSizeInBytes) ? fileUploadSizeInBytes / bytesInGB : null, DiskUsedInGB = diskUsedInGB, DiskUsedPercent = diskUsedInGB.HasValue && diskSizeInGB.HasValue && diskSizeInGB > 0 ? (float?)(diskUsedInGB / diskSizeInGB * 100) : null, VmCpuModelName = metrics.GetValueOrDefault("VmCpuModelName") }; - /**/taskStartTime = TryGetValueAsDateTimeOffset(metrics, "DownloadStart", out var startTime) ? startTime : null; - /**/taskEndTime = TryGetValueAsDateTimeOffset(metrics, "UploadEnd", out var endTime) ? endTime : null; + /* TODO: Not currently implemented */ + taskStartTime = TryGetValueAsDateTimeOffset(metrics, "DownloadStart", out var startTime) ? startTime : null; + /* TODO: Not currently implemented */ + taskEndTime = TryGetValueAsDateTimeOffset(metrics, "UploadEnd", out var endTime) ? 
endTime : null; } catch (Exception ex) { From 60ea49eec099d3960984855f775f6178aff3f861 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 26 Dec 2024 08:09:44 -0800 Subject: [PATCH 178/202] Address failure to load TES task from repository --- src/Tes/Models/BatchNodeMetrics.cs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/Tes/Models/BatchNodeMetrics.cs b/src/Tes/Models/BatchNodeMetrics.cs index d6778f93c..bad232456 100644 --- a/src/Tes/Models/BatchNodeMetrics.cs +++ b/src/Tes/Models/BatchNodeMetrics.cs @@ -3,6 +3,8 @@ using System.Collections.Generic; +#nullable enable + namespace Tes.Models { /// @@ -14,7 +16,7 @@ public class BatchNodeMetrics /// Executor docker image pull duration /// [TesTaskLogMetadataKey("executor_pull_duration_sec")] - public List ExecutorImagePullDurationInSeconds { get; set; } + public List? ExecutorImagePullDurationInSeconds { get; set; } /// /// File download duration @@ -26,7 +28,7 @@ public class BatchNodeMetrics /// Main command execution duration /// [TesTaskLogMetadataKey("executor_duration_sec")] - public List ExecutorDurationInSeconds { get; set; } + public List? ExecutorDurationInSeconds { get; set; } /// /// File upload duration @@ -38,7 +40,7 @@ public class BatchNodeMetrics /// Executor image size in GB /// [TesTaskLogMetadataKey("executor_image_size_gb")] - public List ExecutorImageSizeInGB { get; set; } + public List? ExecutorImageSizeInGB { get; set; } /// /// File download size in GB @@ -80,6 +82,6 @@ public class BatchNodeMetrics /// CPU Model Name /// [TesTaskLogMetadataKey("vm_cpu_model_name")] - public string VmCpuModelName { get; set; } + public string? 
VmCpuModelName { get; set; } } } From ff752760633304b3354c40ed9a7c3e8fc49ffe23 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 26 Dec 2024 14:03:09 -0800 Subject: [PATCH 179/202] attempt to address serialization failure --- src/Tes/Models/TesTaskLogExtended.cs | 35 +++++++++++++++------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/src/Tes/Models/TesTaskLogExtended.cs b/src/Tes/Models/TesTaskLogExtended.cs index 9762eb13a..a8098c44b 100644 --- a/src/Tes/Models/TesTaskLogExtended.cs +++ b/src/Tes/Models/TesTaskLogExtended.cs @@ -79,8 +79,10 @@ internal void StoreObjectsToMetadata(StreamingContext context) internal void LoadObjectsFromMetadata(StreamingContext context) { LoadObjectFromMetadata(this); - this.VirtualMachineInfo = TryGetObjectFromMetadata(out var vmInfo) ? vmInfo : null; - this.BatchNodeMetrics = TryGetObjectFromMetadata(out var metrics) ? metrics : null; + var vmInfo = this.VirtualMachineInfo; + this.VirtualMachineInfo = TryGetObjectFromMetadata(ref vmInfo) ? vmInfo : null; + var metrics = this.BatchNodeMetrics; + this.BatchNodeMetrics = TryGetObjectFromMetadata(ref metrics) ? metrics : null; } /// @@ -118,11 +120,10 @@ private void LoadObjectFromMetadata(T obj) /// /// Type of object to populate and return /// Populated object - private bool TryGetObjectFromMetadata(out T obj) where T : new() + private bool TryGetObjectFromMetadata(ref T obj) where T : new() { if (this.Metadata is null) { - obj = default; return false; } @@ -131,12 +132,11 @@ private void LoadObjectFromMetadata(T obj) if (this.Metadata.Keys.Intersect(metadataKeyNames).Any()) { - obj = new T(); + obj ??= new T(); LoadObjectFromMetadata(obj); return true; } - obj = default; return false; } @@ -149,19 +149,22 @@ private void LoadObjectFromMetadata(T obj) /// True if metadata value is found. 
private bool TryGetMetadataValue(string key, Type type, out object result) { - string value = null; - var hasValue = this.Metadata is not null && this.Metadata.TryGetValue(key, out value); - - if (type == typeof(IEnumerable)) - { - result = hasValue ? JsonConvert.DeserializeObject(value, type) : default; - } - else + if (this.Metadata is not null && this.Metadata.TryGetValue(key, out var value)) { - result = hasValue ? JsonConvert.DeserializeObject($"\"{value}\"", type) : default; + if (typeof(System.Collections.IEnumerable).IsAssignableFrom(type) && !typeof(string).IsAssignableFrom(type)) + { + result = JsonConvert.DeserializeObject(value, type); + } + else + { + result = JsonConvert.DeserializeObject($"\"{value}\"", type); + } + + return true; } - return hasValue; + result = default; + return false; } /// From 06400eea35e4708aa5c667b050d9ace5842a8d35 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Tue, 31 Dec 2024 05:55:44 -0800 Subject: [PATCH 180/202] Fail tasks when associated pool or job is removed --- src/TesApi.Tests/BatchSchedulerTests.cs | 4 ++-- src/TesApi.Web/BatchPool.cs | 20 +++++++++----------- src/TesApi.Web/IBatchPool.cs | 3 ++- src/TesApi.Web/PoolScheduler.cs | 21 ++++++++++++++++++++- 4 files changed, 33 insertions(+), 15 deletions(-) diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index 0ddafe3d9..0269f2416 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -63,7 +63,7 @@ public async Task GetOrAddDoesNotAddExistingAvailablePool() serviceProvider.BatchPoolManager.Verify(mock => mock.CreateBatchPoolAsync(It.IsAny(), It.IsAny(), It.IsAny()), Times.Once); var pool = await batchScheduler.GetOrAddPoolAsync(key, false, (id, cancellationToken) => ValueTask.FromResult(BatchPoolTests.CreatePoolData(name: id)), CancellationToken.None); - await pool.ServicePoolAsync(); + _ = await pool.ServicePoolAsync(); Assert.AreEqual(count, 
batchScheduler.GetPools().Count()); Assert.AreEqual(keyCount, batchScheduler.GetPoolGroupKeys().Count()); @@ -85,7 +85,7 @@ public async Task GetOrAddDoesAddWithExistingUnavailablePool() var count = batchScheduler.GetPools().Count(); var pool = await batchScheduler.GetOrAddPoolAsync(key, false, (id, cancellationToken) => ValueTask.FromResult(BatchPoolTests.CreatePoolData(name: id)), CancellationToken.None); - await pool.ServicePoolAsync(); + _ = await pool.ServicePoolAsync(); Assert.AreNotEqual(count, batchScheduler.GetPools().Count()); Assert.AreEqual(keyCount, batchScheduler.GetPoolGroupKeys().Count()); diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index 65ce04fd2..c4b2f666b 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -446,6 +446,7 @@ public sealed partial class BatchPool : IBatchPool { private static readonly SemaphoreSlim lockObj = new(1, 1); private bool _removedFromService = false; + private readonly List _orphanedTaskIds = []; /// /// Types of maintenance calls offered by the service method. 
@@ -545,7 +546,7 @@ internal async ValueTask ServicePoolAsync(ServiceKind serviceKind, CancellationT } /// - public async ValueTask ServicePoolAsync(CancellationToken cancellationToken) + public async ValueTask> ServicePoolAsync(CancellationToken cancellationToken) { ValueTask StandupQueries() { @@ -586,6 +587,7 @@ ValueTask StandupQueries() return ValueTask.CompletedTask; } + _orphanedTaskIds.Clear(); var exceptions = new List(); // Run each servicing task serially and accumulate the exception, except whenever the pool or the job are not found @@ -595,17 +597,12 @@ await PerformTask(ServicePoolAsync(ServiceKind.ManagePoolScaling, cancellationTo await PerformTask(ServicePoolAsync(ServiceKind.Rotate, cancellationToken), cancellationToken) && await PerformTask(ServicePoolAsync(ServiceKind.RemovePoolIfEmpty, cancellationToken), cancellationToken); - switch (exceptions.Count) + return exceptions.Count switch { - case 0: - return; - - case 1: - throw exceptions.First(); - - default: - throw new AggregateException(exceptions.SelectMany(Flatten)); - } + 0 => _orphanedTaskIds, + 1 => throw exceptions.First(), + _ => throw new AggregateException(exceptions.SelectMany(Flatten)), + }; static IEnumerable Flatten(Exception ex) => ex switch @@ -673,6 +670,7 @@ async ValueTask ParseBatchException(BatchException batchException, Cancell { _logger.LogError(batchException, "Batch pool and/or job {PoolId} is missing. 
Removing them from TES's active pool list.", PoolId); _ = _batchPools.RemovePoolFromList(this); + AssociatedTesTasks.ForEach(entry => _orphanedTaskIds.Add(entry.Key)); await _batchPools.DeletePoolAndJobAsync(this, cancellationToken); // TODO: Consider moving any remaining tasks to another pool, or failing job/tasks explicitly return true; } diff --git a/src/TesApi.Web/IBatchPool.cs b/src/TesApi.Web/IBatchPool.cs index 5bc21ba0a..5066b292d 100644 --- a/src/TesApi.Web/IBatchPool.cs +++ b/src/TesApi.Web/IBatchPool.cs @@ -68,8 +68,9 @@ public interface IBatchPool /// Updates this instance based on changes to its environment. /// /// + /// s of orphaned tasks. /// Calls each internal servicing method in order. Throws all exceptions gathered from all methods. - ValueTask ServicePoolAsync(CancellationToken cancellationToken = default); + ValueTask> ServicePoolAsync(CancellationToken cancellationToken = default); /// /// Lists s running in pool's job. diff --git a/src/TesApi.Web/PoolScheduler.cs b/src/TesApi.Web/PoolScheduler.cs index 9395bb4f5..1f61f46d5 100644 --- a/src/TesApi.Web/PoolScheduler.cs +++ b/src/TesApi.Web/PoolScheduler.cs @@ -70,7 +70,26 @@ await ExecuteActionOnIntervalAsync( async token => await ExecuteActionOnPoolsAsync( async (pool, token) => { - await pool.ServicePoolAsync(token); + var orphanedTaskIds = await pool.ServicePoolAsync(token); + + ConcurrentBag tasks = []; + await orphanedTaskIds.ToAsyncEnumerable() + .ForEachAwaitWithCancellationAsync(async (id, cancellation) => await Repository.TryGetItemAsync(id, cancellation, task => + { + if (task.IsActiveState(preemptedIsTerminal: true) && pool.PoolId.Equals(task.PoolId, StringComparison.OrdinalIgnoreCase)) + { + tasks.Add(task); + } + }), token); + + var now = DateTimeOffset.UtcNow; + await OrchestrateTesTasksOnBatchAsync( + $"OrphanedTasks ({pool.PoolId})", + _ => ValueTask.FromResult(tasks.ToAsyncEnumerable()), + (tesTasks, token) => TaskScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, 
tesTasks.Select(_ => new AzureBatchTaskState(AzureBatchTaskState.TaskState.CompletedWithErrors, BatchTaskEndTime: now, Failure: new("RemovedPoolOrJob", ["Batch pool or job was removed."]))).ToArray(), token), + ex => { Logger.LogError(ex, "Repository collision while failing task ('{TesTask}') due to pool or job removal.", ex.RepositoryItem?.Id ?? ""); return ValueTask.CompletedTask; }, + cancellationToken); + await ProcessTasksAsync(pool, DateTime.UtcNow, pool.ListCloudTasksAsync(), token); }, token), From d7ea8f81499140dcbc92c5814c61fe6a066e2fad Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 2 Jan 2025 09:25:01 -0800 Subject: [PATCH 181/202] handle repository collisions while processing orphaned tasks --- src/TesApi.Web/PoolScheduler.cs | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/src/TesApi.Web/PoolScheduler.cs b/src/TesApi.Web/PoolScheduler.cs index 1f61f46d5..393e21aaa 100644 --- a/src/TesApi.Web/PoolScheduler.cs +++ b/src/TesApi.Web/PoolScheduler.cs @@ -71,8 +71,8 @@ await ExecuteActionOnIntervalAsync( async (pool, token) => { var orphanedTaskIds = await pool.ServicePoolAsync(token); + List tasks = []; - ConcurrentBag tasks = []; await orphanedTaskIds.ToAsyncEnumerable() .ForEachAwaitWithCancellationAsync(async (id, cancellation) => await Repository.TryGetItemAsync(id, cancellation, task => { @@ -83,12 +83,23 @@ await orphanedTaskIds.ToAsyncEnumerable() }), token); var now = DateTimeOffset.UtcNow; - await OrchestrateTesTasksOnBatchAsync( - $"OrphanedTasks ({pool.PoolId})", - _ => ValueTask.FromResult(tasks.ToAsyncEnumerable()), - (tesTasks, token) => TaskScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, tesTasks.Select(_ => new AzureBatchTaskState(AzureBatchTaskState.TaskState.CompletedWithErrors, BatchTaskEndTime: now, Failure: new("RemovedPoolOrJob", ["Batch pool or job was removed."]))).ToArray(), token), - ex => { Logger.LogError(ex, "Repository collision while failing task ('{TesTask}') due to pool 
or job removal.", ex.RepositoryItem?.Id ?? ""); return ValueTask.CompletedTask; }, - cancellationToken); + ConcurrentBag tasks2 = []; + + while (tasks.Count != 0) + { + await OrchestrateTesTasksOnBatchAsync( + $"OrphanedTasks ({pool.PoolId})", + _ => ValueTask.FromResult(tasks.ToAsyncEnumerable()), + (tesTasks, token) => TaskScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, tesTasks.Select(_ => new AzureBatchTaskState(AzureBatchTaskState.TaskState.CompletedWithErrors, BatchTaskEndTime: now, Failure: new("RemovedPoolOrJob", ["Batch pool or job was removed."]))).ToArray(), token), + ex => { Logger.LogError(ex, "Repository collision while failing task ('{TesTask}') due to pool or job removal.", ex.RepositoryItem?.Id ?? ""); tasks2.Add(ex.RepositoryItem); return ValueTask.CompletedTask; }, + cancellationToken); + + if (tasks.Count != 0) + { + tasks = [.. tasks2.Where(task => task is not null)]; + tasks2.Clear(); + } + } await ProcessTasksAsync(pool, DateTime.UtcNow, pool.ListCloudTasksAsync(), token); }, From 4126820ed7102ae8d2d37392628ce730881d7aa4 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 6 Jan 2025 17:09:51 -0800 Subject: [PATCH 182/202] Handle orphaned tasks --- src/TesApi.Tests/BatchPoolTests.cs | 26 +++-- src/TesApi.Tests/BatchSchedulerTests.cs | 105 ++++++++++++++++-- src/TesApi.Web/BatchPool.cs | 20 ++-- src/TesApi.Web/BatchScheduler.cs | 11 ++ src/TesApi.Web/IBatchPool.cs | 3 +- .../OrchestrateOnBatchSchedulerServiceBase.cs | 4 +- src/TesApi.Web/PoolScheduler.cs | 32 +----- src/TesApi.Web/TaskScheduler.cs | 27 ++++- 8 files changed, 164 insertions(+), 64 deletions(-) diff --git a/src/TesApi.Tests/BatchPoolTests.cs b/src/TesApi.Tests/BatchPoolTests.cs index 2005fb96c..807c3b836 100644 --- a/src/TesApi.Tests/BatchPoolTests.cs +++ b/src/TesApi.Tests/BatchPoolTests.cs @@ -5,6 +5,7 @@ using System.Collections.Generic; using System.Linq; using System.Threading.Tasks; +using Azure.ResourceManager.Models; using Microsoft.Azure.Batch; using 
Microsoft.VisualStudio.TestTools.UnitTesting; using Moq; @@ -427,7 +428,14 @@ internal static AutoScaleRun GenerateAutoScaleRun(Microsoft.Azure.Batch.Protocol return autoScaleRun; } - + internal static CloudJob GenerateJob(string id, Microsoft.Azure.Batch.Protocol.Models.JobState state, Microsoft.Azure.Batch.Protocol.Models.PoolInformation poolInfo) + { + var parentClient = CreateMockBatchClient(); + Microsoft.Azure.Batch.Protocol.Models.CloudJob modelJob = new(id: id, state: state, poolInfo: poolInfo); + var job = (CloudJob)typeof(CloudJob).GetConstructor(System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance, default, [typeof(BatchClient), typeof(Microsoft.Azure.Batch.Protocol.Models.CloudJob), typeof(IEnumerable)], default) + .Invoke([parentClient, modelJob, null]); + return job; + } internal static CloudPool GeneratePool( string id, @@ -442,7 +450,8 @@ internal static CloudPool GeneratePool( bool? enableAutoScale = default, DateTime? creationTime = default, IList metadata = default, - IList dataDisks = default) + IList dataDisks = default, + Microsoft.Azure.Batch.Protocol.Models.BatchPoolIdentity identity = default) { if (default == creationTime) { @@ -454,18 +463,19 @@ internal static CloudPool GeneratePool( var parentClient = CreateMockBatchClient(); Microsoft.Azure.Batch.Protocol.Models.CloudPool modelPool = new( id: id, + creationTime: creationTime, + allocationState: (Microsoft.Azure.Batch.Protocol.Models.AllocationState)allocationState, + allocationStateTransitionTime: allocationStateTransitionTime, + virtualMachineConfiguration: new() { DataDisks = dataDisks?.Select(ConvertDataDisk).ToList() ?? 
[] }, + resizeErrors: resizeErrors, currentDedicatedNodes: currentDedicatedNodes, currentLowPriorityNodes: currentLowPriorityNodes, targetDedicatedNodes: targetDedicatedNodes, targetLowPriorityNodes: targetLowPriorityNodes, - allocationState: (Microsoft.Azure.Batch.Protocol.Models.AllocationState)allocationState, - allocationStateTransitionTime: allocationStateTransitionTime, - resizeErrors: resizeErrors, - autoScaleRun: autoScaleRun, enableAutoScale: enableAutoScale, - creationTime: creationTime, + autoScaleRun: autoScaleRun, metadata: metadata.Select(ConvertMetadata).ToList(), - virtualMachineConfiguration: new() { DataDisks = dataDisks?.Select(ConvertDataDisk).ToList() ?? [] }); + identity: identity); var pool = (CloudPool)typeof(CloudPool).GetConstructor(System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance, default, [typeof(BatchClient), typeof(Microsoft.Azure.Batch.Protocol.Models.CloudPool), typeof(IEnumerable)], default) .Invoke([parentClient, modelPool, null]); return pool; diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index 0269f2416..0073f37f4 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -32,6 +32,8 @@ namespace TesApi.Tests [TestClass] public class BatchSchedulerTests { + private const string GlobalManagedIdentity = "/subscriptions/aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee/resourceGroups/SomeResourceGroup/providers/Microsoft.ManagedIdentity/userAssignedIdentities/GlobalManagedIdentity"; + [TestMethod] public async Task LocalPoolCacheAccessesNewPoolsAfterAllPoolsRemovedWithSameKey() { @@ -63,7 +65,7 @@ public async Task GetOrAddDoesNotAddExistingAvailablePool() serviceProvider.BatchPoolManager.Verify(mock => mock.CreateBatchPoolAsync(It.IsAny(), It.IsAny(), It.IsAny()), Times.Once); var pool = await batchScheduler.GetOrAddPoolAsync(key, false, (id, cancellationToken) => ValueTask.FromResult(BatchPoolTests.CreatePoolData(name: 
id)), CancellationToken.None); - _ = await pool.ServicePoolAsync(); + await pool.ServicePoolAsync(); Assert.AreEqual(count, batchScheduler.GetPools().Count()); Assert.AreEqual(keyCount, batchScheduler.GetPoolGroupKeys().Count()); @@ -85,7 +87,7 @@ public async Task GetOrAddDoesAddWithExistingUnavailablePool() var count = batchScheduler.GetPools().Count(); var pool = await batchScheduler.GetOrAddPoolAsync(key, false, (id, cancellationToken) => ValueTask.FromResult(BatchPoolTests.CreatePoolData(name: id)), CancellationToken.None); - _ = await pool.ServicePoolAsync(); + await pool.ServicePoolAsync(); Assert.AreNotEqual(count, batchScheduler.GetPools().Count()); Assert.AreEqual(keyCount, batchScheduler.GetPoolGroupKeys().Count()); @@ -1011,18 +1013,43 @@ public async Task TaskThatFailsWithNodeAllocationErrorIsRequeuedOnDifferentVmSiz [TestMethod] public async Task TaskGetsCancelled() { - var tesTask = new TesTask { Id = "test", PoolId = "pool1", State = TesState.CANCELING, Logs = [new()] }; + var poolId = "pool1-1"; + var tesTask = new TesTask { Id = "test", State = TesState.CANCELING, Logs = [new()] }; var azureProxyReturnValues = AzureProxyReturnValues.Defaults; azureProxyReturnValues.BatchTaskState = BatchTaskStates.CancellationRequested[0]; + + BatchAccountPoolData poolInfo = new() + { + DeploymentConfiguration = new() + { + VmConfiguration = new BatchVmConfiguration(new BatchImageReference(), "batchNodeAgent"), + }, + Identity = new(Azure.ResourceManager.Models.ManagedServiceIdentityType.UserAssigned), + }; + + poolInfo.Identity.UserAssignedIdentities.Add(new(new(GlobalManagedIdentity), new())); + poolInfo.Metadata.Add(new("CoA-TES-Metadata", @"{""hostName"":""hostname"",""isDedicated"":true,""RunnerMD5"":""MD5"",""eventsVersion"":{""EventVersion"":""1.0"",""TesTaskRunnerEntityType"":""TesRunnerTask"",""EventDataVersion"":""1.0""}}")); + poolInfo.Metadata.Add(new(string.Empty, poolId)); + tesTask.PoolId = azureProxyReturnValues.CreateBatchPoolImpl(poolInfo); + var 
job = BatchPoolTests.GenerateJob(tesTask.PoolId, Microsoft.Azure.Batch.Protocol.Models.JobState.Active, new(tesTask.PoolId)); + Mock azureProxy = default; var azureProxySetter = new Action>(mock => { GetMockAzureProxy(azureProxyReturnValues)(mock); + mock.Setup(m => m.GetBatchJobAsync(poolId, It.IsAny(), It.IsAny())) + .Returns(Task.FromResult(job)); azureProxy = mock; }); - _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxySetter, GetMockBatchPoolManager(azureProxyReturnValues), azureProxyReturnValues); + _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync( + tesTask, + GetMockConfig()(), + azureProxySetter, + GetMockBatchPoolManager(azureProxyReturnValues), + azureProxyReturnValues, + serviceProviderActions: (_, scheduler) => scheduler.LoadExistingPoolsAsync(CancellationToken.None).GetAwaiter().GetResult()); GuardAssertsWithTesTask(tesTask, () => { @@ -1183,7 +1210,7 @@ public async Task CromwellWriteFilesAreDiscoveredAndAddedIfMissedWithContentScri Uri executionDirectoryUri = default; - _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxySetter, GetMockBatchPoolManager(azureProxyReturnValues), azureProxyReturnValues, serviceProviderActions: serviceProvider => + _ = await ProcessTesTaskAndGetBatchJobArgumentsAsync(tesTask, GetMockConfig()(), azureProxySetter, GetMockBatchPoolManager(azureProxyReturnValues), azureProxyReturnValues, serviceProviderActions: (serviceProvider, _) => { var storageAccessProvider = serviceProvider.GetServiceOrCreateInstance(); @@ -1333,10 +1360,10 @@ public async Task PoolIsCreatedWithoutPublicIpWhenSubnetAndDisableBatchNodesPubl return (tesTask.Logs?.LastOrDefault()?.FailureReason, tesTask.Logs?.LastOrDefault()?.SystemLogs?.ToArray()); } - private static Task<(string JobId, IEnumerable CloudTask, BatchAccountPoolData batchModelsPool)> ProcessTesTaskAndGetBatchJobArgumentsAsync(TesTask tesTask, IEnumerable<(string Key, string Value)> configuration, Action> 
azureProxy, Action> batchPoolManager, AzureProxyReturnValues azureProxyReturnValues, Action additionalActions = default, Action> serviceProviderActions = default) + private static Task<(string JobId, IEnumerable CloudTask, BatchAccountPoolData batchModelsPool)> ProcessTesTaskAndGetBatchJobArgumentsAsync(TesTask tesTask, IEnumerable<(string Key, string Value)> configuration, Action> azureProxy, Action> batchPoolManager, AzureProxyReturnValues azureProxyReturnValues, Action additionalActions = default, Action, IBatchScheduler> serviceProviderActions = default) => ProcessTesTasksAndGetBatchJobArgumentsAsync([tesTask], configuration, azureProxy, batchPoolManager, azureProxyReturnValues, additionalActions, serviceProviderActions); - private static async Task<(string JobId, IEnumerable CloudTasks, BatchAccountPoolData batchModelsPool)> ProcessTesTasksAndGetBatchJobArgumentsAsync(TesTask[] tesTasks, IEnumerable<(string Key, string Value)> configuration, Action> azureProxy, Action> batchPoolManager, AzureProxyReturnValues azureProxyReturnValues, Action additionalActions = default, Action> serviceProviderActions = default) + private static async Task<(string JobId, IEnumerable CloudTasks, BatchAccountPoolData batchModelsPool)> ProcessTesTasksAndGetBatchJobArgumentsAsync(TesTask[] tesTasks, IEnumerable<(string Key, string Value)> configuration, Action> azureProxy, Action> batchPoolManager, AzureProxyReturnValues azureProxyReturnValues, Action additionalActions = default, Action, IBatchScheduler> serviceProviderActions = default) { using var serviceProvider = GetServiceProvider( configuration, @@ -1347,7 +1374,7 @@ public async Task PoolIsCreatedWithoutPublicIpWhenSubnetAndDisableBatchNodesPubl GetMockAllowedVms(configuration), additionalActions: additionalActions); var batchScheduler = serviceProvider.GetT(); - serviceProviderActions?.Invoke(serviceProvider); + serviceProviderActions?.Invoke(serviceProvider, batchScheduler); if (azureProxyReturnValues.BatchTaskState is null) 
{ @@ -1491,7 +1518,7 @@ private static Action> GetMockAzureProxy(AzureProxyReturnValue .ReturnsAsync(true); azureProxy.Setup(a => a.GetActivePoolsAsync(It.IsAny())) - .Returns(AsyncEnumerable.Empty()); + .Returns(azureProxyReturnValues.GetCloudPools().ToAsyncEnumerable()); azureProxy.Setup(a => a.GetStorageAccountInfoAsync("defaultstorageaccount", It.IsAny())) .Returns(Task.FromResult(azureProxyReturnValues.StorageAccountInfos["defaultstorageaccount"])); @@ -1536,7 +1563,7 @@ private static Action> GetMockAzureProxy(AzureProxyReturnValue => new(() => [ ("Storage:DefaultAccountName", "defaultstorageaccount"), - ("BatchNodes:GlobalManagedIdentity", "/subscriptions/aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee/resourceGroups/SomeResourceGroup/providers/Microsoft.ManagedIdentity/userAssignedIdentities/GlobalManagedIdentity"), + ("BatchNodes:GlobalManagedIdentity", GlobalManagedIdentity), ("BatchScheduling:Prefix", "hostname"), ("BatchImageGen1:Offer", "ubuntu-server-container"), ("BatchImageGen1:Publisher", "microsoft-azure-batch"), @@ -1762,6 +1789,18 @@ internal string CreateBatchPoolImpl(BatchAccountPoolData pool) pool.Metadata.Remove(poolNameItem); var poolId = poolNameItem.Value; + if (pool.Identity is not null) + { + PoolIdentityValues identityItem = new(pool.Identity.ManagedServiceIdentityType switch + { + var x when x == Azure.ResourceManager.Models.ManagedServiceIdentityType.None => Microsoft.Azure.Batch.Protocol.Models.PoolIdentityType.None, + var x when x == Azure.ResourceManager.Models.ManagedServiceIdentityType.UserAssigned => Microsoft.Azure.Batch.Protocol.Models.PoolIdentityType.UserAssigned, + _ => throw new ArgumentOutOfRangeException(nameof(pool), $"{nameof(BatchAccountPoolData.Identity)}.{nameof(BatchAccountPoolData.Identity.ManagedServiceIdentityType)} has an unsupported value.") + }, pool.Identity.UserAssignedIdentities?.Select(item => new UserAssignedIdentityValues(item.Key.ToString(), item.Value?.ClientId.ToString(), 
item.Value?.PrincipalId.ToString())).ToArray()); + + pool.Metadata.Add(new(nameof(BatchAccountPoolData.Identity), System.Text.Json.JsonSerializer.Serialize(identityItem, serializerOptions.Value))); + } + poolMetadata.AddOrUpdate(poolId, _ => pool.Metadata?.Select(Convert).ToList(), (_, _) => throw new Exception("Unexpected attempt to modify pool.")); return poolId; @@ -1776,8 +1815,52 @@ internal CloudPool GetBatchPoolImpl(string poolId) items = null; } - return BatchPoolTests.GeneratePool(poolId, metadata: items); + items = items?.ToList(); + + var identityItem = items?.SingleOrDefault(item => nameof(BatchAccountPoolData.Identity).Equals(item.Name, StringComparison.Ordinal)); + + if (identityItem is not null) + { + items.Remove(identityItem); + } + + return BatchPoolTests.GeneratePool(poolId, metadata: items, identity: GetIdentity(identityItem?.Value)); + + static Microsoft.Azure.Batch.Protocol.Models.BatchPoolIdentity GetIdentity(string item) + { + if (item is null) + { + return default; + } + + var identityItem = System.Text.Json.JsonSerializer.Deserialize(item, serializerOptions.Value); + + return new() + { + Type = identityItem.Type, + UserAssignedIdentities = identityItem.UserAssignedIdentites.Select(uai => new Microsoft.Azure.Batch.Protocol.Models.UserAssignedIdentity(uai.ResourceId, uai.ClientId, uai.PrincipalId)).ToList() + }; + } } + + private static readonly Lazy serializerOptions = new(() => + { + System.Text.Json.JsonSerializerOptions options = new() + { + //DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingDefault, + PropertyNamingPolicy = System.Text.Json.JsonNamingPolicy.CamelCase + }; + + options.Converters.Add(new System.Text.Json.Serialization.JsonStringEnumConverter(System.Text.Json.JsonNamingPolicy.CamelCase, true)); + + return options; + }); + + internal IEnumerable GetCloudPools() + => poolMetadata.Select(pool => pool.Key).Select(pool => GetBatchPoolImpl(pool)); + + private record struct 
UserAssignedIdentityValues(string ResourceId, string ClientId, string PrincipalId); + private record struct PoolIdentityValues(Microsoft.Azure.Batch.Protocol.Models.PoolIdentityType Type, UserAssignedIdentityValues[] UserAssignedIdentites); } private static BatchVmFamilyCoreQuota CreateBatchVmFamilyCoreQuota(string name, int? quota) diff --git a/src/TesApi.Web/BatchPool.cs b/src/TesApi.Web/BatchPool.cs index c4b2f666b..e47a139ca 100644 --- a/src/TesApi.Web/BatchPool.cs +++ b/src/TesApi.Web/BatchPool.cs @@ -446,7 +446,6 @@ public sealed partial class BatchPool : IBatchPool { private static readonly SemaphoreSlim lockObj = new(1, 1); private bool _removedFromService = false; - private readonly List _orphanedTaskIds = []; /// /// Types of maintenance calls offered by the service method. @@ -546,7 +545,7 @@ internal async ValueTask ServicePoolAsync(ServiceKind serviceKind, CancellationT } /// - public async ValueTask> ServicePoolAsync(CancellationToken cancellationToken) + public async ValueTask ServicePoolAsync(CancellationToken cancellationToken) { ValueTask StandupQueries() { @@ -587,7 +586,6 @@ ValueTask StandupQueries() return ValueTask.CompletedTask; } - _orphanedTaskIds.Clear(); var exceptions = new List(); // Run each servicing task serially and accumulate the exception, except whenever the pool or the job are not found @@ -597,11 +595,16 @@ await PerformTask(ServicePoolAsync(ServiceKind.ManagePoolScaling, cancellationTo await PerformTask(ServicePoolAsync(ServiceKind.Rotate, cancellationToken), cancellationToken) && await PerformTask(ServicePoolAsync(ServiceKind.RemovePoolIfEmpty, cancellationToken), cancellationToken); - return exceptions.Count switch + switch (exceptions.Count) { - 0 => _orphanedTaskIds, - 1 => throw exceptions.First(), - _ => throw new AggregateException(exceptions.SelectMany(Flatten)), + case 0: + return; + + case 1: + throw exceptions.First(); + + default: + throw new AggregateException(exceptions.SelectMany(Flatten)); }; static 
IEnumerable Flatten(Exception ex) @@ -670,8 +673,7 @@ async ValueTask ParseBatchException(BatchException batchException, Cancell { _logger.LogError(batchException, "Batch pool and/or job {PoolId} is missing. Removing them from TES's active pool list.", PoolId); _ = _batchPools.RemovePoolFromList(this); - AssociatedTesTasks.ForEach(entry => _orphanedTaskIds.Add(entry.Key)); - await _batchPools.DeletePoolAndJobAsync(this, cancellationToken); // TODO: Consider moving any remaining tasks to another pool, or failing job/tasks explicitly + await _batchPools.DeletePoolAndJobAsync(this, cancellationToken); return true; } diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 548879c72..e2e0af12a 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -218,6 +218,7 @@ async Task SetTaskStateAndLogAsync(TesTask tesTask, TesState newTaskState, var (batchNodeMetrics, taskStartTime, taskEndTime, cromwellRcCode) = newTaskState == TesState.COMPLETE ? 
await GetBatchNodeMetricsAndCromwellResultCodeAsync(tesTask, cancellationToken) : default; + var taskAsString = SerializeToString(tesTask); lock (setTaskStateLock) @@ -349,6 +350,12 @@ async Task TerminateBatchTaskAsync(TesTask tesTask, TesState newTaskState, return true; // It was never scheduled default: + if (!GetPools().Select(pool => pool.PoolId).Contains(tesTask.PoolId, StringComparer.OrdinalIgnoreCase)) + { + // Task was orphaned + return true; + } + await azureProxy.TerminateBatchTaskAsync(tesTask.Id, tesTask.PoolId, cancellationToken); return true; } @@ -357,6 +364,10 @@ async Task TerminateBatchTaskAsync(TesTask tesTask, TesState newTaskState, { return true; } + catch (BatchException exc) when (BatchErrorCodeStrings.JobNotFound.Equals(exc.RequestInformation?.BatchError?.Code, StringComparison.OrdinalIgnoreCase)) + { + return true; + } catch (Exception exc) { logger.LogError(exc, "Exception terminating batch task with tesTask.Id: {TesTaskId}", tesTask?.Id); diff --git a/src/TesApi.Web/IBatchPool.cs b/src/TesApi.Web/IBatchPool.cs index 5066b292d..5bc21ba0a 100644 --- a/src/TesApi.Web/IBatchPool.cs +++ b/src/TesApi.Web/IBatchPool.cs @@ -68,9 +68,8 @@ public interface IBatchPool /// Updates this instance based on changes to its environment. /// /// - /// s of orphaned tasks. /// Calls each internal servicing method in order. Throws all exceptions gathered from all methods. - ValueTask> ServicePoolAsync(CancellationToken cancellationToken = default); + ValueTask ServicePoolAsync(CancellationToken cancellationToken = default); /// /// Lists s running in pool's job. 
diff --git a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs index 01e4f373c..e59bab852 100644 --- a/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs +++ b/src/TesApi.Web/OrchestrateOnBatchSchedulerServiceBase.cs @@ -260,7 +260,7 @@ protected async ValueTask ProcessOrchestratedTesTaskAsync(string pollName, Relat { var currentTesTask = rce.RepositoryItem; - if (currentTesTask is not null) + if (currentTesTask == tesTask) { await requeue(rce); } @@ -330,7 +330,7 @@ protected async ValueTask OrchestrateTesTasksOnBatchAsync( if (BatchScheduler.NeedPoolFlush) { - var pools = (await Repository.GetItemsAsync(task => task.State == TesState.INITIALIZING || task.State == TesState.RUNNING, cancellationToken)).Select(task => task.PoolId).Distinct(); + var pools = (await Repository.GetItemsAsync(task => task.State == TesState.INITIALIZING || task.State == TesState.RUNNING, cancellationToken)).Select(task => task.PoolId).Distinct(StringComparer.OrdinalIgnoreCase); await BatchScheduler.FlushPoolsAsync(pools, cancellationToken); } diff --git a/src/TesApi.Web/PoolScheduler.cs b/src/TesApi.Web/PoolScheduler.cs index 393e21aaa..9395bb4f5 100644 --- a/src/TesApi.Web/PoolScheduler.cs +++ b/src/TesApi.Web/PoolScheduler.cs @@ -70,37 +70,7 @@ await ExecuteActionOnIntervalAsync( async token => await ExecuteActionOnPoolsAsync( async (pool, token) => { - var orphanedTaskIds = await pool.ServicePoolAsync(token); - List tasks = []; - - await orphanedTaskIds.ToAsyncEnumerable() - .ForEachAwaitWithCancellationAsync(async (id, cancellation) => await Repository.TryGetItemAsync(id, cancellation, task => - { - if (task.IsActiveState(preemptedIsTerminal: true) && pool.PoolId.Equals(task.PoolId, StringComparison.OrdinalIgnoreCase)) - { - tasks.Add(task); - } - }), token); - - var now = DateTimeOffset.UtcNow; - ConcurrentBag tasks2 = []; - - while (tasks.Count != 0) - { - await OrchestrateTesTasksOnBatchAsync( - $"OrphanedTasks 
({pool.PoolId})", - _ => ValueTask.FromResult(tasks.ToAsyncEnumerable()), - (tesTasks, token) => TaskScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, tesTasks.Select(_ => new AzureBatchTaskState(AzureBatchTaskState.TaskState.CompletedWithErrors, BatchTaskEndTime: now, Failure: new("RemovedPoolOrJob", ["Batch pool or job was removed."]))).ToArray(), token), - ex => { Logger.LogError(ex, "Repository collision while failing task ('{TesTask}') due to pool or job removal.", ex.RepositoryItem?.Id ?? ""); tasks2.Add(ex.RepositoryItem); return ValueTask.CompletedTask; }, - cancellationToken); - - if (tasks.Count != 0) - { - tasks = [.. tasks2.Where(task => task is not null)]; - tasks2.Clear(); - } - } - + await pool.ServicePoolAsync(token); await ProcessTasksAsync(pool, DateTime.UtcNow, pool.ListCloudTasksAsync(), token); }, token), diff --git a/src/TesApi.Web/TaskScheduler.cs b/src/TesApi.Web/TaskScheduler.cs index 631de7ec9..d8786a002 100644 --- a/src/TesApi.Web/TaskScheduler.cs +++ b/src/TesApi.Web/TaskScheduler.cs @@ -59,6 +59,7 @@ internal class TaskScheduler(RunnerEventsProcessor nodeEventProcessor, Microsoft internal static readonly TimeSpan BatchRunInterval = TimeSpan.FromSeconds(30); // The very fastest processes inside of Azure Batch accessing anything within pools or jobs appears to use a 30 second polling interval private static readonly TimeSpan shortBackgroundRunInterval = TimeSpan.FromSeconds(1); private static readonly TimeSpan longBackgroundRunInterval = TimeSpan.FromSeconds(2.5); + private static readonly TimeSpan orphanedTaskInterval = TimeSpan.FromMinutes(10); private readonly RunnerEventsProcessor nodeEventProcessor = nodeEventProcessor; /// @@ -153,6 +154,7 @@ protected override async ValueTask ExecuteCoreAsync(CancellationToken cancellati queuedTasks.Add(ExecuteQueuedTesTasksOnBatchAsync(cancellationToken)); queuedTasks.Add(ExecuteCancelledTesTasksOnBatchAsync(cancellationToken)); 
queuedTasks.Add(ExecuteUpdateTesTaskFromEventBlobAsync(cancellationToken)); + queuedTasks.Add(ExecuteProcessOrphanedTasksAsync(cancellationToken)); if (cancellationToken.IsCancellationRequested) { @@ -265,7 +267,7 @@ private Task ExecuteCancelledTesTasksOnBatchAsync(CancellationToken cancellation requeues.Clear(); await OrchestrateTesTasksOnBatchAsync( "Cancelled", - query, + _ => ValueTask.FromResult(tasks.ToAsyncEnumerable()), (tasks, ct) => ((ITaskScheduler)this).ProcessTesTaskBatchStatesAsync( tasks, Enumerable.Repeat(new(AzureBatchTaskState.TaskState.CancellationRequested), tasks.Length).ToArray(), @@ -501,6 +503,29 @@ void ITaskScheduler.QueueTesTask(TesTask tesTask) queuedTesTasks.Enqueue(tesTask); } + private async Task ExecuteProcessOrphanedTasksAsync(CancellationToken cancellationToken) + { + List statesToSkip = [TesState.QUEUED, TesState.CANCELING]; + statesToSkip.AddRange(TesTask.TerminalStates); + + await ExecuteActionOnIntervalAsync(orphanedTaskInterval, + async token => + { + var pools = BatchScheduler.GetPools().Select(p => p.PoolId).ToArray(); + var now = DateTimeOffset.UtcNow; + + await OrchestrateTesTasksOnBatchAsync( + $"OrphanedTasks", + async cancellation => (await Repository.GetItemsAsync(task => !statesToSkip.Contains(task.State), cancellation)) + .Where(task => !pools.Contains(task.PoolId, StringComparer.OrdinalIgnoreCase)) + .ToAsyncEnumerable(), + (tesTasks, cancellation) => ((ITaskScheduler)this).ProcessTesTaskBatchStatesAsync(tesTasks, tesTasks.Select(_ => new AzureBatchTaskState(AzureBatchTaskState.TaskState.CompletedWithErrors, BatchTaskEndTime: now, Failure: new(AzureBatchTaskState.SystemError, ["RemovedPoolOrJob", "Batch pool or job was removed."]))).ToArray(), cancellation), + ex => { Logger.LogError(ex, "Repository collision while failing task ('{TesTask}') due to pool or job removal.", ex.RepositoryItem?.Id ?? 
""); return ValueTask.CompletedTask; }, + token); + }, + cancellationToken); + } + /// IAsyncEnumerable> ITaskScheduler.ProcessTesTaskBatchStatesAsync(IEnumerable tesTasks, AzureBatchTaskState[] taskStates, CancellationToken cancellationToken) { From 2d0f68c4e9a15b007b0c1661eae4d953c4f7139c Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 6 Jan 2025 18:15:31 -0800 Subject: [PATCH 183/202] Address flaky tests --- .../PriceApiClientTests.cs | 44 +++++++++++++++++-- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/src/Tes.ApiClients.Tests/PriceApiClientTests.cs b/src/Tes.ApiClients.Tests/PriceApiClientTests.cs index 89603511f..9adbae7e2 100644 --- a/src/Tes.ApiClients.Tests/PriceApiClientTests.cs +++ b/src/Tes.ApiClients.Tests/PriceApiClientTests.cs @@ -35,7 +35,19 @@ public void Cleanup() [TestMethod] public async Task GetPricingInformationPageAsync_ReturnsSinglePageWithItemsWithMaxPageSize() { - var page = await pricingApiClient.GetPricingInformationPageAsync(DateTime.UtcNow, 0, "Virtual Machines", "westus2", CancellationToken.None); + Models.Pricing.RetailPricingData? page = default; + + try + { + page = await pricingApiClient.GetPricingInformationPageAsync(DateTime.UtcNow, 0, "Virtual Machines", "westus2", CancellationToken.None); + } + catch (HttpRequestException ex) + { + if (ex.StatusCode == System.Net.HttpStatusCode.TooManyRequests) + { + Assert.Inconclusive(); + } + } Assert.IsNotNull(page); Assert.IsTrue(page.Items.Length > 0); @@ -44,7 +56,19 @@ public async Task GetPricingInformationPageAsync_ReturnsSinglePageWithItemsWithM [TestMethod] public async Task GetPricingInformationAsync_ReturnsMoreThan100Items() { - var pages = await pricingApiClient.GetAllPricingInformationAsync("Virtual Machines", "westus2", CancellationToken.None).ToListAsync(); + List? 
pages = default; + + try + { + pages = await pricingApiClient.GetAllPricingInformationAsync("Virtual Machines", "westus2", CancellationToken.None).ToListAsync(); + } + catch (HttpRequestException ex) + { + if (ex.StatusCode == System.Net.HttpStatusCode.TooManyRequests) + { + Assert.Inconclusive(); + } + } Assert.IsNotNull(pages); Assert.IsTrue(pages.Count > 100); @@ -53,9 +77,21 @@ public async Task GetPricingInformationAsync_ReturnsMoreThan100Items() [TestMethod] public async Task GetAllPricingInformationForNonWindowsAndNonSpotVmsAsync_ReturnsOnlyNonWindowsAndNonSpotInstances() { - var pages = await pricingApiClient.GetAllPricingInformationForNonWindowsAndNonSpotVmsAsync("westus2", CancellationToken.None).ToListAsync(); + List? pages = default; + + try + { + pages = await pricingApiClient.GetAllPricingInformationForNonWindowsAndNonSpotVmsAsync("westus2", CancellationToken.None).ToListAsync(); + } + catch (HttpRequestException ex) + { + if (ex.StatusCode == System.Net.HttpStatusCode.TooManyRequests) + { + Assert.Inconclusive(); + } + } - Assert.IsTrue(pages.Count > 0); + Assert.IsTrue(pages?.Count > 0); Assert.IsFalse(pages.Any(r => r.productName.Contains(" Windows"))); Assert.IsFalse(pages.Any(r => r.productName.Contains(" Spot"))); } From f89c2ea38245997903dfc26e1ccb01e16974ad66 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 6 Jan 2025 18:25:00 -0800 Subject: [PATCH 184/202] Address CodeQL build failure --- src/CommonUtilities/AzureServicesConnectionStringCredential.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CommonUtilities/AzureServicesConnectionStringCredential.cs b/src/CommonUtilities/AzureServicesConnectionStringCredential.cs index aad87483a..725364d5d 100644 --- a/src/CommonUtilities/AzureServicesConnectionStringCredential.cs +++ b/src/CommonUtilities/AzureServicesConnectionStringCredential.cs @@ -71,7 +71,7 @@ private AzureServicesConnectionStringCredentialOptions() private void SetInitialState(AzureCloudConfig 
armEndpoints) { - (GetEnvironmentVariable("AZURE_ADDITIONALLY_ALLOWED_TENANTS") ?? string.Empty).Split([';'], StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries).ForEach(AdditionallyAllowedTenants.Add); + (GetEnvironmentVariable("AZURE_ADDITIONALLY_ALLOWED_TENANTS") ?? string.Empty).Split((char[]?)[';'], StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries).ForEach(AdditionallyAllowedTenants.Add); TenantId = GetEnvironmentVariable("AZURE_TENANT_ID")!; AuthorityHost = armEndpoints.AuthorityHost ?? new(armEndpoints.Authentication?.LoginEndpointUrl ?? throw new ArgumentException("AuthorityHost is missing", nameof(armEndpoints))); Audience = armEndpoints.ArmEnvironment?.Audience ?? armEndpoints.Authentication?.Audiences?.LastOrDefault() ?? throw new ArgumentException("Audience is missing", nameof(armEndpoints)); From f938f154372d6b441f8871a20c6bee94ca8d9bf9 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 6 Jan 2025 18:33:21 -0800 Subject: [PATCH 185/202] Address CodeQL build failure again --- src/CommonUtilities/AzureServicesConnectionStringCredential.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CommonUtilities/AzureServicesConnectionStringCredential.cs b/src/CommonUtilities/AzureServicesConnectionStringCredential.cs index 725364d5d..957fdf5a6 100644 --- a/src/CommonUtilities/AzureServicesConnectionStringCredential.cs +++ b/src/CommonUtilities/AzureServicesConnectionStringCredential.cs @@ -71,7 +71,7 @@ private AzureServicesConnectionStringCredentialOptions() private void SetInitialState(AzureCloudConfig armEndpoints) { - (GetEnvironmentVariable("AZURE_ADDITIONALLY_ALLOWED_TENANTS") ?? string.Empty).Split((char[]?)[';'], StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries).ForEach(AdditionallyAllowedTenants.Add); + (GetEnvironmentVariable("AZURE_ADDITIONALLY_ALLOWED_TENANTS") ?? 
string.Empty).Split(';', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries).ForEach(AdditionallyAllowedTenants.Add); TenantId = GetEnvironmentVariable("AZURE_TENANT_ID")!; AuthorityHost = armEndpoints.AuthorityHost ?? new(armEndpoints.Authentication?.LoginEndpointUrl ?? throw new ArgumentException("AuthorityHost is missing", nameof(armEndpoints))); Audience = armEndpoints.ArmEnvironment?.Audience ?? armEndpoints.Authentication?.Audiences?.LastOrDefault() ?? throw new ArgumentException("Audience is missing", nameof(armEndpoints)); From a20a375c9269afe96806cc0d9e04a7929918f34f Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 6 Jan 2025 18:43:02 -0800 Subject: [PATCH 186/202] Address CodeQL build failure redux --- .github/workflows/codeql.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 80ee3a988..14e2b1b17 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -60,8 +60,8 @@ jobs: # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java). # If this step fails, then you should remove it and run the build manually (see below) - - name: Autobuild - uses: github/codeql-action/autobuild@v2 + # - name: Autobuild + # uses: github/codeql-action/autobuild@v2 # ℹ️ Command-line programs to run using the OS shell. # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun @@ -69,8 +69,9 @@ jobs: # If the Autobuild fails above, remove it and uncomment the following three lines. # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 
- # - run: | - # echo "Run, Build Application using script" + - run: | + echo "Run, Build Application using script" + dotnet build # ./location_of_script_within_repo/buildscript.sh - name: Perform CodeQL Analysis From caa62d2c903914bbcc4ea6a7375886a777e74a16 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 6 Jan 2025 18:48:57 -0800 Subject: [PATCH 187/202] Fix yaml syntax error --- .github/workflows/codeql.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 14e2b1b17..244640c16 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -69,10 +69,12 @@ jobs: # If the Autobuild fails above, remove it and uncomment the following three lines. # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. + # - run: | + # echo "Run, Build Application using script" + # ./location_of_script_within_repo/buildscript.sh + - run: | - echo "Run, Build Application using script" dotnet build - # ./location_of_script_within_repo/buildscript.sh - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@v2 From 36a03c6ca55271247d01608e5f29dbfe9dd05e3f Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 6 Jan 2025 18:51:46 -0800 Subject: [PATCH 188/202] Fix yaml syntax error redux --- .github/workflows/codeql.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 244640c16..960c05821 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -73,7 +73,7 @@ jobs: # echo "Run, Build Application using script" # ./location_of_script_within_repo/buildscript.sh - - run: | + - run: dotnet build - name: Perform CodeQL Analysis From 7ef9a513bf8b172d25fac5dd983a67330d8f89c3 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Mon, 6 Jan 2025 19:05:09 -0800 Subject: [PATCH 189/202] yaml syntax yet again --- 
.github/workflows/codeql.yml | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 960c05821..f19e18d73 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -57,11 +57,17 @@ jobs: # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs # queries: security-extended,security-and-quality + - name: Setup dotnet + uses: actions/setup-dotnet@v4 + with: + dotnet-version: | + 8 + 9 # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java). # If this step fails, then you should remove it and run the build manually (see below) - # - name: Autobuild - # uses: github/codeql-action/autobuild@v2 + - name: Autobuild + uses: github/codeql-action/autobuild@v2 # ℹ️ Command-line programs to run using the OS shell. 
# 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun @@ -73,9 +79,6 @@ jobs: # echo "Run, Build Application using script" # ./location_of_script_within_repo/buildscript.sh - - run: - dotnet build - - name: Perform CodeQL Analysis uses: github/codeql-action/analyze@v2 with: From 1c4f9e6c957f829387ff3a11a77fdb59ad7da59e Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Wed, 12 Feb 2025 17:03:56 -0800 Subject: [PATCH 190/202] Use GITHUB_TOKEN to limit GitHub API rate-limiting --- src/build-push-acr/AcrBuild.cs | 2 +- src/build-push-acr/GitHubArchive.cs | 23 +++++++++++++++++++++++ src/build-push-acr/Program.cs | 4 +++- src/deploy-tes-on-azure/Deployer.cs | 2 +- 4 files changed, 28 insertions(+), 3 deletions(-) diff --git a/src/build-push-acr/AcrBuild.cs b/src/build-push-acr/AcrBuild.cs index d3efe5e9b..69be2bdd3 100644 --- a/src/build-push-acr/AcrBuild.cs +++ b/src/build-push-acr/AcrBuild.cs @@ -72,7 +72,7 @@ public static IArchive GetGitHubArchive(BuildType buildType, string @ref, Micros ? new Microsoft.Kiota.Abstractions.Authentication.AnonymousAuthenticationProvider() : new GitHub.Octokit.Client.Authentication.TokenAuthProvider(tokenProvider)) .WithUserAgent("microsoft-ga4gh-tes", string.Empty) - .WithRequestTimeout(TimeSpan.FromSeconds(100)) + .WithRequestTimeout(TimeSpan.FromHours(1.5)) //.WithBaseUrl("https://api.github.com") .Build()), "microsoft", diff --git a/src/build-push-acr/GitHubArchive.cs b/src/build-push-acr/GitHubArchive.cs index 647d9252b..b33bb5455 100644 --- a/src/build-push-acr/GitHubArchive.cs +++ b/src/build-push-acr/GitHubArchive.cs @@ -4,6 +4,7 @@ using System.IO.Compression; using System.Net; using CommonUtilities; +using Microsoft.Kiota.Abstractions.Authentication; namespace BuildPushAcr { @@ -24,6 +25,28 @@ public sealed class GitHubArchive(GitHub.GitHubClient client, string owner, stri private string? srcRoot; private string? root; + public static IAccessTokenProvider? 
GetAccessTokenProvider() + { + var pat = Environment.GetEnvironmentVariable("GITHUB_TOKEN"); + + if (string.IsNullOrWhiteSpace(pat)) + { + return default; + } + + return new AccessTokenProvider(pat); + } + + private class AccessTokenProvider(string pat) : IAccessTokenProvider + { + private readonly string pat = pat; + + AllowedHostsValidator IAccessTokenProvider.AllowedHostsValidator { get; } = new(["api.github.com"]); + + Task IAccessTokenProvider.GetAuthorizationTokenAsync(Uri uri, Dictionary? additionalAuthenticationContext, CancellationToken cancellationToken) + => Task.FromResult(pat); + } + async ValueTask IArchive.GetTagAsync(CancellationToken cancellationToken) { List tags = []; diff --git a/src/build-push-acr/Program.cs b/src/build-push-acr/Program.cs index 19639314d..dd944d16f 100644 --- a/src/build-push-acr/Program.cs +++ b/src/build-push-acr/Program.cs @@ -100,7 +100,9 @@ IArchive GetLocalGitArchive(System.CommandLine.Invocation.InvocationContext cont => AcrBuild.GetLocalGitArchiveAsync(context.ParseResult.GetValueForOption(directoryOption)!); IArchive GetGitHubArchive(System.CommandLine.Invocation.InvocationContext context) - => AcrBuild.GetGitHubArchive(context.ParseResult.GetValueForOption(buildTypeOption), context.ParseResult.GetValueForOption(tagOption)!.ToString(3)); + => AcrBuild.GetGitHubArchive(context.ParseResult.GetValueForOption(buildTypeOption), + context.ParseResult.GetValueForOption(tagOption)!.ToString(3), + GitHubArchive.GetAccessTokenProvider()); githubCommand.SetHandler(context => Handler(context, GetGitHubArchive)); localCommand.SetHandler(context => Handler(context, GetLocalGitArchive)); diff --git a/src/deploy-tes-on-azure/Deployer.cs b/src/deploy-tes-on-azure/Deployer.cs index 87426f1b1..0720c3ac4 100644 --- a/src/deploy-tes-on-azure/Deployer.cs +++ b/src/deploy-tes-on-azure/Deployer.cs @@ -1335,7 +1335,7 @@ private async Task BuildPushAcrAsync(Dictionary settings, string if (string.IsNullOrWhiteSpace(configuration.SolutionDir)) { 
- tar = AcrBuild.GetGitHubArchive(BuildType.Tes, string.IsNullOrWhiteSpace(configuration.GitHubCommit) ? new Version(targetVersion).ToString(3) : configuration.GitHubCommit); + tar = AcrBuild.GetGitHubArchive(BuildType.Tes, string.IsNullOrWhiteSpace(configuration.GitHubCommit) ? new Version(targetVersion).ToString(3) : configuration.GitHubCommit, GitHubArchive.GetAccessTokenProvider()); tarDisposable = tar as IAsyncDisposable; } else From ebc62b8cd9e8754325243e0a0d99f0595de568ee Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 14 Feb 2025 21:25:29 -0800 Subject: [PATCH 191/202] Implement multiple executors results --- src/Tes.Runner/Events/EventsPublisher.cs | 16 ++++- src/Tes.SDK.Examples/Program.cs | 3 + src/Tes.SDK.Examples/TesExamples.cs | 26 ++++++++ src/Tes/Extensions/TesTaskExtensions.cs | 24 +++++-- src/TesApi.Web/AzureBatchTaskState.cs | 2 + src/TesApi.Web/BatchScheduler.cs | 48 ++++++++------ .../Events/RunnerEventsProcessor.cs | 65 ++++++++++++------- src/TesApi.Web/PoolScheduler.cs | 6 +- .../Properties/launchSettings.json | 3 +- 9 files changed, 136 insertions(+), 57 deletions(-) diff --git a/src/Tes.Runner/Events/EventsPublisher.cs b/src/Tes.Runner/Events/EventsPublisher.cs index bfda9896c..adfecec40 100644 --- a/src/Tes.Runner/Events/EventsPublisher.cs +++ b/src/Tes.Runner/Events/EventsPublisher.cs @@ -123,6 +123,10 @@ public virtual async Task PublishUploadEndEventAsync(NodeTask nodeTask, int numb await PublishAsync(eventMessage); } + private static string ExecutorFormatted(NodeTask nodeTask, int selector) + // Maintain format with TesApi.Web.Events.RunnerEventsProcessor.GetMessageBatchStateAsync+ParseExecutorIndex() + => $"{selector + 1}/{nodeTask.Executors?.Count ?? 
0}"; + public virtual async Task PublishExecutorStartEventAsync(NodeTask nodeTask, int selector) { var eventMessage = CreateNewEventMessage(nodeTask.Id, ExecutorStartEvent, StartedStatus, @@ -133,7 +137,7 @@ public virtual async Task PublishExecutorStartEventAsync(NodeTask nodeTask, int eventMessage.EventData = new() { - { "executor", $"{selector + 1}/{nodeTask.Executors?.Count ?? 0}" }, + { "executor", ExecutorFormatted(nodeTask, selector) }, { "image", executor?.ImageName ?? string.Empty}, { "imageTag", executor?.ImageTag ?? string.Empty}, { "commands", string.Join(' ', commands) } @@ -150,7 +154,7 @@ public virtual async Task PublishExecutorEndEventAsync(NodeTask nodeTask, int se eventMessage.EventData = new() { - { "executor", $"{selector + 1}/{nodeTask.Executors?.Count ?? 0}" }, + { "executor", ExecutorFormatted(nodeTask, selector) }, { "image", executor?.ImageName ?? string.Empty}, { "imageTag", executor?.ImageTag ?? string.Empty}, { "exitCode", exitCode.ToString()}, @@ -243,8 +247,14 @@ public async Task FlushPublishersAsync(int waitTimeInSeconds = 60) await Task.WhenAll(stopTasks).WaitAsync(TimeSpan.FromSeconds(waitTimeInSeconds)); } - public async ValueTask DisposeAsync() + protected async virtual ValueTask DisposeAsyncCore() { await FlushPublishersAsync(); } + + public async ValueTask DisposeAsync() + { + await DisposeAsyncCore(); + GC.SuppressFinalize(this); + } } diff --git a/src/Tes.SDK.Examples/Program.cs b/src/Tes.SDK.Examples/Program.cs index e2392aa07..0ad3a2e96 100644 --- a/src/Tes.SDK.Examples/Program.cs +++ b/src/Tes.SDK.Examples/Program.cs @@ -43,6 +43,9 @@ private static async Task Main(string[] args) case "bwa": await tesExamples.RunBwaMemAsync(); break; + case "createtaskignoreerror": + await tesExamples.CreateTaskIgnoreError(); + break; default: Console.WriteLine("Unknown command. 
Please use 'prime' for RunPrimeSieveAsync or 'bwa' for RunBwaMemAsync."); break; diff --git a/src/Tes.SDK.Examples/TesExamples.cs b/src/Tes.SDK.Examples/TesExamples.cs index d54c1c271..388830fcc 100644 --- a/src/Tes.SDK.Examples/TesExamples.cs +++ b/src/Tes.SDK.Examples/TesExamples.cs @@ -120,6 +120,32 @@ public async Task RunBwaMemAsync() await RunTasks(new List { task }); } + internal async Task CreateTaskIgnoreError() + { + using ITesClient tesClient = new TesClient(_tesCredentials); + var task = await tesClient.CreateAndWaitTilDoneAsync(new TesTask() + { + Name = "CompTest", + Description = "CompTest", + Executors = + [ + new() + { + Image = "alpine", + Command = [ "ERROR" ], + IgnoreError = true + }, + new() + { + Image = "alpine", + Command = [ "echo", "hello" ] + } + ] + }); + + task = await tesClient.GetTaskAsync(task.Id, view: TesView.FULL); + } + private async Task RunTasks(List tasks) { var sw = Stopwatch.StartNew(); diff --git a/src/Tes/Extensions/TesTaskExtensions.cs b/src/Tes/Extensions/TesTaskExtensions.cs index 4886e88f0..5344fb9c5 100644 --- a/src/Tes/Extensions/TesTaskExtensions.cs +++ b/src/Tes/Extensions/TesTaskExtensions.cs @@ -133,17 +133,27 @@ public static Dictionary GetOrAddMetadata(this TesTaskLog tesTas /// Returns the last . Adds it if none exist. /// /// + /// Index of corresponding /// Initialized - public static TesExecutorLog GetOrAddExecutorLog(this TesTaskLog tesTaskLog) + public static TesExecutorLog GetOrAddExecutorLog(this TesTaskLog tesTaskLog, int? index = default) { - if ((tesTaskLog.Logs?.Count ?? 0) == 0) + if (index is null) { - TesExecutorLog log = new(); - tesTaskLog.Logs = [log]; - return log; + if ((tesTaskLog.Logs?.Count ?? 
0) == 0) + { + TesExecutorLog log = new(); + tesTaskLog.Logs = [log]; + return log; + } + + return tesTaskLog.Logs.Last(); + } + else + { + tesTaskLog.Logs ??= []; + tesTaskLog.Logs.EnsureCapacity(index.Value); + return tesTaskLog.Logs[index.Value] ??= new(); } - - return tesTaskLog.Logs.Last(); } /// diff --git a/src/TesApi.Web/AzureBatchTaskState.cs b/src/TesApi.Web/AzureBatchTaskState.cs index 26a769047..2be3df9c5 100644 --- a/src/TesApi.Web/AzureBatchTaskState.cs +++ b/src/TesApi.Web/AzureBatchTaskState.cs @@ -17,6 +17,7 @@ namespace TesApi.Web /// Failure information. /// . /// . + /// index of referred to in this record. /// . /// . /// . @@ -29,6 +30,7 @@ public record class AzureBatchTaskState( FailureInformation? Failure = default, DateTimeOffset? CloudTaskCreationTime = default, DateTimeOffset? BatchTaskStartTime = default, + int? ExecutorIndex = default, DateTimeOffset? ExecutorStartTime = default, DateTimeOffset? ExecutorEndTime = default, int? ExecutorExitCode = default, diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index bacb4f64d..0c6e991f8 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -233,9 +233,9 @@ async Task SetTaskStateAndLogAsync(TesTask tesTask, TesState newTaskState, tesTaskLog.EndTime ??= batchInfo.BatchTaskEndTime ?? taskEndTime; tesTaskLog.StartTime ??= batchInfo.BatchTaskStartTime ?? 
taskStartTime; - if (batchInfo.ExecutorEndTime is not null || batchInfo.ExecutorStartTime is not null || batchInfo.ExecutorExitCode is not null) + if (batchInfo.ExecutorIndex is not null || batchInfo.ExecutorEndTime is not null || batchInfo.ExecutorStartTime is not null || batchInfo.ExecutorExitCode is not null) { - var tesTaskExecutorLog = tesTaskLog.GetOrAddExecutorLog(); + var tesTaskExecutorLog = tesTaskLog.GetOrAddExecutorLog(batchInfo.ExecutorIndex); tesTaskExecutorLog.StartTime ??= batchInfo.ExecutorStartTime; tesTaskExecutorLog.EndTime ??= batchInfo.ExecutorEndTime; tesTaskExecutorLog.ExitCode ??= batchInfo.ExecutorExitCode; @@ -247,6 +247,13 @@ async Task SetTaskStateAndLogAsync(TesTask tesTask, TesState newTaskState, .Concat(DeserializeJsonStringArray(tesTaskExecutorLog.Stdout) ?? [])); } } + else if (!tesTask.IsActiveState(/*TODO*/) && (tesTaskLog.Logs?.Any(log => log.ExitCode is null || log.EndTime is null) ?? true)) + { + var tesTaskExecutorLog = tesTaskLog.GetOrAddExecutorLog(); + tesTaskExecutorLog.StartTime ??= batchInfo.BatchTaskStartTime ?? taskStartTime ?? tesTaskLog.StartTime; + tesTaskExecutorLog.EndTime ??= batchInfo.BatchTaskEndTime ?? taskEndTime ?? tesTaskLog.EndTime; + tesTaskExecutorLog.ExitCode ??= batchInfo.BatchTaskExitCode; + } if (batchInfo.OutputFileLogs is not null) { @@ -265,7 +272,7 @@ async Task SetTaskStateAndLogAsync(TesTask tesTask, TesState newTaskState, tesTask.SetWarning(warningInfo[0]); break; default: - tesTask.SetWarning(warningInfo[0], warningInfo.Skip(1).ToArray()); + tesTask.SetWarning(warningInfo[0], [.. warningInfo.Skip(1)]); break; } } @@ -274,10 +281,9 @@ async Task SetTaskStateAndLogAsync(TesTask tesTask, TesState newTaskState, { tesTask.SetFailureReason( batchInfo.Failure.Value.Reason, - (batchInfo.Failure.Value.SystemLogs ?? (string.IsNullOrWhiteSpace(batchInfo.AlternateSystemLogItem) + [.. (batchInfo.Failure.Value.SystemLogs ?? (string.IsNullOrWhiteSpace(batchInfo.AlternateSystemLogItem) ? 
[] - : Enumerable.Empty().Append(batchInfo.AlternateSystemLogItem)) - ).ToArray()); + : Enumerable.Empty().Append(batchInfo.AlternateSystemLogItem)))]); } else if (!(string.IsNullOrWhiteSpace(batchInfo.AlternateSystemLogItem) || tesTask.IsActiveState() || new[] { TesState.COMPLETE, TesState.CANCELED }.Contains(tesTask.State))) { @@ -458,9 +464,9 @@ internal static string CreateWgetDownloadCommand(Uri urlToDownload, string local /// /// Retrieves pools associated with this TES from the batch account. /// - /// A for controlling the lifetime of the asynchronous operation. /// - private IAsyncEnumerable GetCloudPools(CancellationToken cancellationToken) + ///// A for controlling the lifetime of the asynchronous operation. + private IAsyncEnumerable GetCloudPools(CancellationToken _1) => azureProxy.GetActivePoolsAsync(batchPrefix); private Lazy _loadExistingPools = null; @@ -791,7 +797,7 @@ private static void QueuedTesTaskAddTaskEntryToQueueFromDirectory( true => dictionary.TryRemove(new(key, refValue)), // Attempt queue removal from dictionary false => dictionary.TryUpdate(key, new(timer, queue), refValue), // Attempt dictionary update with items remaining in the queue }); - queue = ImmutableQueue.CreateRange(refValue.Queue.WhereNot(tasks.Contains))) // get the list of items that were not "enqueued" + queue = [.. refValue.Queue.WhereNot(tasks.Contains)]) // get the list of items that were not "enqueued" { refValue = dictionary[key]; // get the current dictionary value for the key } @@ -837,7 +843,7 @@ private static async Task PerformTaskAsync(Func me } catch (Exception exception) { - taskCompletions = (taskCompletions ?? []).ToList(); + taskCompletions = [.. (taskCompletions ?? 
[])]; if (taskCompletions.Any()) { @@ -864,7 +870,7 @@ private static async Task PerformTaskOfTAsync(Func result) modelPoolFactory: async (id, ct) => await GetPoolSpecification( name: id, displayName: pool.VirtualMachineInfo.PoolDisplayName, - poolIdentity: GetBatchPoolIdentity(pool.VirtualMachineInfo.Identities.WhereNot(string.IsNullOrWhiteSpace).ToList()), + poolIdentity: GetBatchPoolIdentity([.. pool.VirtualMachineInfo.Identities.WhereNot(string.IsNullOrWhiteSpace)]), vmInfo: pool.VirtualMachineInfo, initialTarget: pool.InitialTarget, nodeInfo: (useGen2 ?? false) ? gen2BatchNodeInfo : gen1BatchNodeInfo, @@ -1093,9 +1099,11 @@ private async Task ConvertTesTaskToBatchTaskUsingRunnerAsync(string t { Constraints = new(maxWallClockTime: taskMaxWallClockTime, retentionTime: TimeSpan.Zero, maxTaskRetryCount: 0), UserIdentity = new(new AutoUserSpecification(elevationLevel: ElevationLevel.Admin, scope: AutoUserScope.Pool)), - EnvironmentSettings = assets.Environment.Select(pair => new EnvironmentSetting(pair.Key, pair.Value)) - .Concat(Enumerable.Repeat(new("DEBUG_DELAY", debugDelay?.ToString("c")), debugDelay is null ? 0 : 1)) - .ToList(), + EnvironmentSettings = + [ + .. assets.Environment.Select(pair => new EnvironmentSetting(pair.Key, pair.Value)), + .. Enumerable.Repeat(new("DEBUG_DELAY", debugDelay?.ToString("c")), debugDelay is null ? 0 : 1), + ], }; } @@ -1151,7 +1159,7 @@ private async ValueTask> GetExistingBlobsInCromwellStorageLocatio if (commandScript is not null) { - return blobsInExecutionDirectory + return [.. 
blobsInExecutionDirectory .Select(b => (Path: $"/{metadata.CromwellExecutionDir.TrimStart('/')}/{b.BlobName.Split('/').Last()}", Uri: new BlobUriBuilder(executionDirectoryUri) { BlobName = b.BlobName }.ToUri())) .Select(b => new TesInput @@ -1160,8 +1168,7 @@ private async ValueTask> GetExistingBlobsInCromwellStorageLocatio Url = b.Uri.AbsoluteUri, Name = Path.GetFileName(b.Path), Type = TesFileType.FILE - }) - .ToList(); + })]; } } @@ -1500,12 +1507,11 @@ internal async Task GetVmSizeAsync(TesTa if (!string.IsNullOrWhiteSpace(vmSize)) { - eligibleVms = virtualMachineInfoList + eligibleVms = [.. virtualMachineInfoList .Where(vm => vm.LowPriority == preemptible && vm.VmSize.Equals(vmSize, StringComparison.OrdinalIgnoreCase)) - .Select(vm => new(vm, [])) - .ToList(); + .Select(vm => new(vm, []))]; noVmFoundMessage = $"No VM (out of {virtualMachineInfoList.Count}) available with the required resources (vmsize: {vmSize}, preemptible: {preemptible}) for task id {tesTask.Id}."; } diff --git a/src/TesApi.Web/Events/RunnerEventsProcessor.cs b/src/TesApi.Web/Events/RunnerEventsProcessor.cs index a64f538d6..45b06171d 100644 --- a/src/TesApi.Web/Events/RunnerEventsProcessor.cs +++ b/src/TesApi.Web/Events/RunnerEventsProcessor.cs @@ -238,26 +238,32 @@ public IEnumerable OrderProcessedByExecutorSequence(IEnumerable source, ArgumentNullException.ThrowIfNull(source); ArgumentNullException.ThrowIfNull(messageGetter); - return source.OrderBy(OrderByT).ThenBy(ThenByT); + return source + .OrderBy(new OrderByAdapter(OrderByCreated, messageGetter).OrderBy) + .ThenBy(new OrderByAdapter(ThenByName, messageGetter).OrderBy) + .ThenBy(new OrderByAdapter(ThenByIndex, messageGetter).OrderBy); - DateTime OrderByT(T item) - => OrderBy(messageGetter(item)); - - int ThenByT(T item) - => ThenBy(messageGetter(item)); - - static DateTime OrderBy(RunnerEventsMessage message) + static DateTime OrderByCreated(RunnerEventsMessage message) => (message.RunnerEventMessage?.Created ?? 
DateTime.Parse(message.Tags["created"])).ToUniversalTime(); - static int ThenBy(RunnerEventsMessage message) + static int ThenByName(RunnerEventsMessage message) => ParseEventName(message.RunnerEventMessage is null ? message.Tags["event-name"] : message.RunnerEventMessage.Name); + static int ThenByIndex(RunnerEventsMessage message) + => int.Parse(message.RunnerEventMessage?.EventData.FirstOrDefault(p => "executor".Equals(p.Key)).Value ?? "0"); + static int ParseEventName(string eventName) => EventsInOrder.TryGetValue(eventName, out var result) ? result : 0; } + private readonly struct OrderByAdapter(Func adapter, Func messageGetter) + { + public TOrder OrderBy(TItem item) + => adapter(messageGetter(item)); + } + /// /// Gets the task status details from this event message. /// @@ -295,26 +301,30 @@ public async Task GetMessageBatchStateAsync(RunnerEventsMes _ => throw new System.Diagnostics.UnreachableException(), }, - Tes.Runner.Events.EventsPublisher.ExecutorStartEvent => new(AzureBatchTaskState.TaskState.Running, + Tes.Runner.Events.EventsPublisher.ExecutorStartEvent => new( + AzureBatchTaskState.TaskState.Running, + ExecutorIndex: ParseExecutorIndex(nodeMessage.EventData), ExecutorStartTime: nodeMessage.Created), Tes.Runner.Events.EventsPublisher.ExecutorEndEvent => nodeMessage.StatusMessage switch { Tes.Runner.Events.EventsPublisher.SuccessStatus => await new AzureBatchTaskState( AzureBatchTaskState.TaskState.InfoUpdate, + ExecutorIndex: ParseExecutorIndex(nodeMessage.EventData), ExecutorEndTime: nodeMessage.Created, ExecutorExitCode: int.Parse(nodeMessage.EventData["exitCode"])) - .WithActionAsync(() => AddProcessLogsAsync(tesTask, cancellationToken)), + .WithActionAsync(state => AddProcessLogsAsync(tesTask, state.ExecutorIndex ?? 
-1, cancellationToken)), Tes.Runner.Events.EventsPublisher.FailedStatus => await new AzureBatchTaskState( AzureBatchTaskState.TaskState.InfoUpdate, + ExecutorIndex: ParseExecutorIndex(nodeMessage.EventData), Failure: new(AzureBatchTaskState.ExecutorError, Enumerable.Empty() .Append(nodeMessage.EventData["errorMessage"]) .Concat(await AddProcessLogsIfAvailableAsync(nodeMessage, tesTask, cancellationToken))), ExecutorEndTime: nodeMessage.Created, ExecutorExitCode: int.Parse(nodeMessage.EventData["exitCode"])) - .WithActionAsync(() => AddProcessLogsAsync(tesTask, cancellationToken)), + .WithActionAsync(state => AddProcessLogsAsync(tesTask, state.ExecutorIndex ?? -1, cancellationToken)), _ => throw new System.Diagnostics.UnreachableException(), }, @@ -359,6 +369,13 @@ public async Task GetMessageBatchStateAsync(RunnerEventsMes }; // Helpers + static int? ParseExecutorIndex(IDictionary eventData) + // Maintain format with TesApi.Web.Events.RunnerEventsProcessor.ExecutorFormatted() + => (eventData?.TryGetValue("executor", out var value) ?? false) && + int.TryParse(value.Split('/', 2, StringSplitOptions.TrimEntries)[0], out var result) + ? 
result + : default; + static IEnumerable GetOutputFileLogs(IDictionary eventData) { if (eventData is null || !eventData.TryGetValue("fileLog-Count", out var fileCount)) @@ -379,7 +396,7 @@ public async Task GetMessageBatchStateAsync(RunnerEventsMes async ValueTask> AddProcessLogsIfAvailableAsync(Tes.Runner.Events.EventMessage message, Tes.Models.TesTask tesTask, CancellationToken cancellationToken) { - var processLogs = await GetProcessLogsAsync(message.Name, tesTask, cancellationToken).ToListAsync(cancellationToken); + var processLogs = await GetProcessLogsAsync(message, tesTask, cancellationToken).ToListAsync(cancellationToken); if (processLogs.Any()) { @@ -389,12 +406,12 @@ async ValueTask> AddProcessLogsIfAvailableAsync(Tes.Runner.E return processLogs; } - IAsyncEnumerable GetProcessLogsAsync(string messageName, Tes.Models.TesTask tesTask, CancellationToken cancellationToken) + IAsyncEnumerable GetProcessLogsAsync(Tes.Runner.Events.EventMessage message, Tes.Models.TesTask tesTask, CancellationToken cancellationToken) { - var blobNameStartsWith = messageName switch + var blobNameStartsWith = message.Name switch { Tes.Runner.Events.EventsPublisher.DownloadEndEvent => "download_std", - Tes.Runner.Events.EventsPublisher.ExecutorEndEvent => "exec_std", + Tes.Runner.Events.EventsPublisher.ExecutorEndEvent => $"exec-{ParseExecutorIndex(message.EventData):D3}_std", Tes.Runner.Events.EventsPublisher.UploadEndEvent => "upload_std", _ => string.Empty, }; @@ -430,14 +447,18 @@ IAsyncEnumerable GetProcessLogsAsync(string messageName, Tes.Models.TesT } } - async ValueTask AddProcessLogsAsync(Tes.Models.TesTask tesTask, CancellationToken cancellationToken) + async ValueTask AddProcessLogsAsync(Tes.Models.TesTask tesTask, int index, CancellationToken cancellationToken) { + if (index < 0) + { + return; + } + var stderr = Enumerable.Empty(); var stdout = Enumerable.Empty(); - await foreach (var (uri, label) in GetAvailableProcessLogsAsync("task-executor-1", tesTask, 
cancellationToken).WithCancellation(cancellationToken)) + await foreach (var (uri, label) in GetAvailableProcessLogsAsync($"task-executor-{index:D}_std", tesTask, cancellationToken).WithCancellation(cancellationToken)) { - switch (label) { case "stderr": @@ -455,7 +476,7 @@ async ValueTask AddProcessLogsAsync(Tes.Models.TesTask tesTask, CancellationToke if (stderr.Any() || stdout.Any()) { - var log = tesTask.GetOrAddTesTaskLog().GetOrAddExecutorLog(); + var log = tesTask.GetOrAddTesTaskLog().GetOrAddExecutorLog(index); if (stderr.Any()) { @@ -528,9 +549,9 @@ public DownloadOrParseException(string message, Exception exception) : base(mess internal static partial class Extensions { - public static async ValueTask WithActionAsync(this AzureBatchTaskState state, Func action) + public static async ValueTask WithActionAsync(this AzureBatchTaskState state, Func action) { - await action(); + await action(state); return state; } } diff --git a/src/TesApi.Web/PoolScheduler.cs b/src/TesApi.Web/PoolScheduler.cs index 9395bb4f5..c89350632 100644 --- a/src/TesApi.Web/PoolScheduler.cs +++ b/src/TesApi.Web/PoolScheduler.cs @@ -193,7 +193,7 @@ await Parallel.ForEachAsync(states, cancellationToken, async (state, token) => await OrchestrateTesTasksOnBatchAsync( $"NodeState ({poolId})", _ => ValueTask.FromResult(tasks.ToAsyncEnumerable()), - (tesTasks, token) => TaskScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, tesTasks.Select(task => statesByTask[task.Id]).ToArray(), token), + (tesTasks, token) => TaskScheduler.ProcessTesTaskBatchStatesAsync(tesTasks, [.. 
tesTasks.Select(task => statesByTask[task.Id])], token), ex => { requeues.Add(ex.RepositoryItem.Id); return ValueTask.CompletedTask; }, cancellationToken); // Fetch updated TesTasks from the repository @@ -267,10 +267,10 @@ private async IAsyncEnumerable GetCloudTaskStatesAsync( { var tasksWithNodeIds = tasks.ToList(); - taskListWithComputeNodeInfo = tasksWithNodeIds.Where(task => !string.IsNullOrWhiteSpace(task.PreviousComputeNodeId)).ToList(); + taskListWithComputeNodeInfo = [.. tasksWithNodeIds.Where(task => !string.IsNullOrWhiteSpace(task.PreviousComputeNodeId))]; var taskList = tasksWithNodeIds.Select(task => task.CloudTask).ToList(); activeTaskList = [.. taskList.Where(ActiveTaskListPredicate).OrderByDescending(task => task.StateTransitionTime?.ToUniversalTime())]; - completedTaskList = taskList.Where(task => CompletedTaskListPredicate(task, now)).ToList(); + completedTaskList = [.. taskList.Where(task => CompletedTaskListPredicate(task, now))]; } if (taskListWithComputeNodeInfo.Count > 0) diff --git a/src/deploy-tes-on-azure/Properties/launchSettings.json b/src/deploy-tes-on-azure/Properties/launchSettings.json index 48d66dde6..be46bdff7 100644 --- a/src/deploy-tes-on-azure/Properties/launchSettings.json +++ b/src/deploy-tes-on-azure/Properties/launchSettings.json @@ -2,7 +2,8 @@ "profiles": { "Setup": { "commandName": "Project", - "commandLineArgs": "--subscriptionid aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee --regionname westus2 --mainidentifierprefix toatest --AadGroupIds aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee", + "commandLineArgs": "--SubscriptionId 594bef42-33f3-42df-b056-e7399d6ae7a0 --HelmBinaryPath \"C:\\Users\\v-murriblair\\AppData\\Local\\Microsoft\\WinGet\\Packages\\Helm.Helm_Microsoft.Winget.Source_8wekyb3d8bbwe\\windows-amd64\\helm.exe\" --DebugLogging true --KubectlBinaryPath \"C:\\Users\\v-murriblair\\AppData\\Local\\Microsoft\\WinGet\\Packages\\Kubernetes.kubectl_Microsoft.Winget.Source_8wekyb3d8bbwe\\kubectl.exe\" --Update True --ResourceGroupName 
bmtwe-24c50640 --SolutionDir D:\\source\\repos\\alt\\ga4gh-tes", + "workingDirectory": "D:\\source\\deploy\\Terra\\westeurope\\bmtwe", "sqlDebugging": true } } From fd3636b360b1222afd95925bd794b468e377fde0 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 14 Feb 2025 21:34:05 -0800 Subject: [PATCH 192/202] revert file --- src/deploy-tes-on-azure/Properties/launchSettings.json | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/deploy-tes-on-azure/Properties/launchSettings.json b/src/deploy-tes-on-azure/Properties/launchSettings.json index be46bdff7..f5d6f1e4b 100644 --- a/src/deploy-tes-on-azure/Properties/launchSettings.json +++ b/src/deploy-tes-on-azure/Properties/launchSettings.json @@ -2,9 +2,8 @@ "profiles": { "Setup": { "commandName": "Project", - "commandLineArgs": "--SubscriptionId 594bef42-33f3-42df-b056-e7399d6ae7a0 --HelmBinaryPath \"C:\\Users\\v-murriblair\\AppData\\Local\\Microsoft\\WinGet\\Packages\\Helm.Helm_Microsoft.Winget.Source_8wekyb3d8bbwe\\windows-amd64\\helm.exe\" --DebugLogging true --KubectlBinaryPath \"C:\\Users\\v-murriblair\\AppData\\Local\\Microsoft\\WinGet\\Packages\\Kubernetes.kubectl_Microsoft.Winget.Source_8wekyb3d8bbwe\\kubectl.exe\" --Update True --ResourceGroupName bmtwe-24c50640 --SolutionDir D:\\source\\repos\\alt\\ga4gh-tes", - "workingDirectory": "D:\\source\\deploy\\Terra\\westeurope\\bmtwe", + "commandLineArgs": "--subscriptionid aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee --regionname westus2 --mainidentifierprefix toatest --AadGroupIds aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee", "sqlDebugging": true } } -} \ No newline at end of file +} From e245ce23adfeebe452611c7ea986e875d9001332 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 14 Feb 2025 22:07:58 -0800 Subject: [PATCH 193/202] cleanup --- src/TesApi.Web/BatchScheduler.cs | 37 ++++++++++++++++---------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 
bacb4f64d..f382d5a7b 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -265,7 +265,7 @@ async Task SetTaskStateAndLogAsync(TesTask tesTask, TesState newTaskState, tesTask.SetWarning(warningInfo[0]); break; default: - tesTask.SetWarning(warningInfo[0], warningInfo.Skip(1).ToArray()); + tesTask.SetWarning(warningInfo[0], [.. warningInfo.Skip(1)]); break; } } @@ -274,10 +274,9 @@ async Task SetTaskStateAndLogAsync(TesTask tesTask, TesState newTaskState, { tesTask.SetFailureReason( batchInfo.Failure.Value.Reason, - (batchInfo.Failure.Value.SystemLogs ?? (string.IsNullOrWhiteSpace(batchInfo.AlternateSystemLogItem) + ([.. batchInfo.Failure.Value.SystemLogs ?? (string.IsNullOrWhiteSpace(batchInfo.AlternateSystemLogItem) ? [] - : Enumerable.Empty().Append(batchInfo.AlternateSystemLogItem)) - ).ToArray()); + : Enumerable.Empty().Append(batchInfo.AlternateSystemLogItem))])); } else if (!(string.IsNullOrWhiteSpace(batchInfo.AlternateSystemLogItem) || tesTask.IsActiveState() || new[] { TesState.COMPLETE, TesState.CANCELED }.Contains(tesTask.State))) { @@ -458,9 +457,9 @@ internal static string CreateWgetDownloadCommand(Uri urlToDownload, string local /// /// Retrieves pools associated with this TES from the batch account. /// - /// A for controlling the lifetime of the asynchronous operation. /// - private IAsyncEnumerable GetCloudPools(CancellationToken cancellationToken) + ///// A for controlling the lifetime of the asynchronous operation. 
+ private IAsyncEnumerable GetCloudPools(CancellationToken _1) => azureProxy.GetActivePoolsAsync(batchPrefix); private Lazy _loadExistingPools = null; @@ -791,7 +790,7 @@ private static void QueuedTesTaskAddTaskEntryToQueueFromDirectory( true => dictionary.TryRemove(new(key, refValue)), // Attempt queue removal from dictionary false => dictionary.TryUpdate(key, new(timer, queue), refValue), // Attempt dictionary update with items remaining in the queue }); - queue = ImmutableQueue.CreateRange(refValue.Queue.WhereNot(tasks.Contains))) // get the list of items that were not "enqueued" + queue = [.. refValue.Queue.WhereNot(tasks.Contains)]) // get the list of items that were not "enqueued" { refValue = dictionary[key]; // get the current dictionary value for the key } @@ -837,7 +836,7 @@ private static async Task PerformTaskAsync(Func me } catch (Exception exception) { - taskCompletions = (taskCompletions ?? []).ToList(); + taskCompletions = [.. (taskCompletions ?? [])]; if (taskCompletions.Any()) { @@ -864,7 +863,7 @@ private static async Task PerformTaskOfTAsync(Func result) modelPoolFactory: async (id, ct) => await GetPoolSpecification( name: id, displayName: pool.VirtualMachineInfo.PoolDisplayName, - poolIdentity: GetBatchPoolIdentity(pool.VirtualMachineInfo.Identities.WhereNot(string.IsNullOrWhiteSpace).ToList()), + poolIdentity: GetBatchPoolIdentity([.. pool.VirtualMachineInfo.Identities.WhereNot(string.IsNullOrWhiteSpace)]), vmInfo: pool.VirtualMachineInfo, initialTarget: pool.InitialTarget, nodeInfo: (useGen2 ?? false) ? 
gen2BatchNodeInfo : gen1BatchNodeInfo, @@ -1093,9 +1092,11 @@ private async Task ConvertTesTaskToBatchTaskUsingRunnerAsync(string t { Constraints = new(maxWallClockTime: taskMaxWallClockTime, retentionTime: TimeSpan.Zero, maxTaskRetryCount: 0), UserIdentity = new(new AutoUserSpecification(elevationLevel: ElevationLevel.Admin, scope: AutoUserScope.Pool)), - EnvironmentSettings = assets.Environment.Select(pair => new EnvironmentSetting(pair.Key, pair.Value)) - .Concat(Enumerable.Repeat(new("DEBUG_DELAY", debugDelay?.ToString("c")), debugDelay is null ? 0 : 1)) - .ToList(), + EnvironmentSettings = + [ + .. assets.Environment.Select(pair => new EnvironmentSetting(pair.Key, pair.Value)), + .. Enumerable.Repeat(new("DEBUG_DELAY", debugDelay?.ToString("c")), debugDelay is null ? 0 : 1), + ], }; } @@ -1151,7 +1152,7 @@ private async ValueTask> GetExistingBlobsInCromwellStorageLocatio if (commandScript is not null) { - return blobsInExecutionDirectory + return [.. blobsInExecutionDirectory .Select(b => (Path: $"/{metadata.CromwellExecutionDir.TrimStart('/')}/{b.BlobName.Split('/').Last()}", Uri: new BlobUriBuilder(executionDirectoryUri) { BlobName = b.BlobName }.ToUri())) .Select(b => new TesInput @@ -1160,8 +1161,7 @@ private async ValueTask> GetExistingBlobsInCromwellStorageLocatio Url = b.Uri.AbsoluteUri, Name = Path.GetFileName(b.Path), Type = TesFileType.FILE - }) - .ToList(); + })]; } } @@ -1500,12 +1500,11 @@ internal async Task GetVmSizeAsync(TesTa if (!string.IsNullOrWhiteSpace(vmSize)) { - eligibleVms = virtualMachineInfoList + eligibleVms = [.. 
virtualMachineInfoList .Where(vm => vm.LowPriority == preemptible && vm.VmSize.Equals(vmSize, StringComparison.OrdinalIgnoreCase)) - .Select(vm => new(vm, [])) - .ToList(); + .Select(vm => new(vm, []))]; noVmFoundMessage = $"No VM (out of {virtualMachineInfoList.Count}) available with the required resources (vmsize: {vmSize}, preemptible: {preemptible}) for task id {tesTask.Id}."; } From db47d50b79cb71abe6d34b5de4f148a51ad66639 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 14 Feb 2025 22:17:46 -0800 Subject: [PATCH 194/202] fix mistype --- src/TesApi.Web/BatchScheduler.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index f382d5a7b..b436a4d23 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -274,9 +274,9 @@ async Task SetTaskStateAndLogAsync(TesTask tesTask, TesState newTaskState, { tesTask.SetFailureReason( batchInfo.Failure.Value.Reason, - ([.. batchInfo.Failure.Value.SystemLogs ?? (string.IsNullOrWhiteSpace(batchInfo.AlternateSystemLogItem) + [.. (batchInfo.Failure.Value.SystemLogs ?? (string.IsNullOrWhiteSpace(batchInfo.AlternateSystemLogItem) ? 
[] - : Enumerable.Empty().Append(batchInfo.AlternateSystemLogItem))])); + : Enumerable.Empty().Append(batchInfo.AlternateSystemLogItem)))]); } else if (!(string.IsNullOrWhiteSpace(batchInfo.AlternateSystemLogItem) || tesTask.IsActiveState() || new[] { TesState.COMPLETE, TesState.CANCELED }.Contains(tesTask.State))) { From a801b276bdbcee49331aead71dbe32710cab81a2 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Fri, 14 Feb 2025 22:22:01 -0800 Subject: [PATCH 195/202] remove incomplete example --- src/Tes.SDK.Examples/Program.cs | 3 --- src/Tes.SDK.Examples/TesExamples.cs | 26 -------------------------- 2 files changed, 29 deletions(-) diff --git a/src/Tes.SDK.Examples/Program.cs b/src/Tes.SDK.Examples/Program.cs index 0ad3a2e96..e2392aa07 100644 --- a/src/Tes.SDK.Examples/Program.cs +++ b/src/Tes.SDK.Examples/Program.cs @@ -43,9 +43,6 @@ private static async Task Main(string[] args) case "bwa": await tesExamples.RunBwaMemAsync(); break; - case "createtaskignoreerror": - await tesExamples.CreateTaskIgnoreError(); - break; default: Console.WriteLine("Unknown command. 
Please use 'prime' for RunPrimeSieveAsync or 'bwa' for RunBwaMemAsync."); break; diff --git a/src/Tes.SDK.Examples/TesExamples.cs b/src/Tes.SDK.Examples/TesExamples.cs index 388830fcc..d54c1c271 100644 --- a/src/Tes.SDK.Examples/TesExamples.cs +++ b/src/Tes.SDK.Examples/TesExamples.cs @@ -120,32 +120,6 @@ public async Task RunBwaMemAsync() await RunTasks(new List { task }); } - internal async Task CreateTaskIgnoreError() - { - using ITesClient tesClient = new TesClient(_tesCredentials); - var task = await tesClient.CreateAndWaitTilDoneAsync(new TesTask() - { - Name = "CompTest", - Description = "CompTest", - Executors = - [ - new() - { - Image = "alpine", - Command = [ "ERROR" ], - IgnoreError = true - }, - new() - { - Image = "alpine", - Command = [ "echo", "hello" ] - } - ] - }); - - task = await tesClient.GetTaskAsync(task.Id, view: TesView.FULL); - } - private async Task RunTasks(List tasks) { var sw = Stopwatch.StartNew(); From d4d47edbb26d62e8ff029e992bb4ee59cb0c18dc Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Sat, 15 Feb 2025 02:34:27 -0800 Subject: [PATCH 196/202] Fix executor log locator and address various exceptions --- src/Tes.RunnerCLI/Tes.RunnerCLI.csproj | 1 + src/Tes/Extensions/TesTaskExtensions.cs | 23 ++++++------------- src/TesApi.Tests/BatchSchedulerTests.cs | 12 ++++++++++ src/TesApi.Web/BatchScheduler.cs | 11 ++------- .../Events/RunnerEventsProcessor.cs | 18 +++++++-------- src/TesApi.Web/IBatchScheduler.cs | 4 ++-- src/TesApi.Web/TesApi.Web.csproj | 1 + 7 files changed, 34 insertions(+), 36 deletions(-) diff --git a/src/Tes.RunnerCLI/Tes.RunnerCLI.csproj b/src/Tes.RunnerCLI/Tes.RunnerCLI.csproj index 854f60d5e..329137a91 100644 --- a/src/Tes.RunnerCLI/Tes.RunnerCLI.csproj +++ b/src/Tes.RunnerCLI/Tes.RunnerCLI.csproj @@ -26,6 +26,7 @@ net8.0 enable enable + true tes-runner linux-x64 true diff --git a/src/Tes/Extensions/TesTaskExtensions.cs b/src/Tes/Extensions/TesTaskExtensions.cs index 5344fb9c5..5db775e50 100644 --- 
a/src/Tes/Extensions/TesTaskExtensions.cs +++ b/src/Tes/Extensions/TesTaskExtensions.cs @@ -135,25 +135,16 @@ public static Dictionary GetOrAddMetadata(this TesTaskLog tesTas /// /// Index of corresponding /// Initialized - public static TesExecutorLog GetOrAddExecutorLog(this TesTaskLog tesTaskLog, int? index = default) + public static TesExecutorLog GetOrAddExecutorLog(this TesTaskLog tesTaskLog, int index) { - if (index is null) - { - if ((tesTaskLog.Logs?.Count ?? 0) == 0) - { - TesExecutorLog log = new(); - tesTaskLog.Logs = [log]; - return log; - } - - return tesTaskLog.Logs.Last(); - } - else + tesTaskLog.Logs ??= []; + + while (tesTaskLog.Logs.Count <= index) { - tesTaskLog.Logs ??= []; - tesTaskLog.Logs.EnsureCapacity(index.Value); - return tesTaskLog.Logs[index.Value] ??= new(); + tesTaskLog.Logs.Add(null); } + + return tesTaskLog.Logs[index] ??= new(); } /// diff --git a/src/TesApi.Tests/BatchSchedulerTests.cs b/src/TesApi.Tests/BatchSchedulerTests.cs index dba670dcd..db46c15a1 100644 --- a/src/TesApi.Tests/BatchSchedulerTests.cs +++ b/src/TesApi.Tests/BatchSchedulerTests.cs @@ -35,6 +35,18 @@ public class BatchSchedulerTests { private const string GlobalManagedIdentity = "/subscriptions/aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee/resourceGroups/SomeResourceGroup/providers/Microsoft.ManagedIdentity/userAssignedIdentities/GlobalManagedIdentity"; + [TestMethod] + public void GetOrAddExecutorLogCreatesLogAtIndex() + { + var index = 3; + TesTask tesTask = new() { Logs = [new()] }; + + var log = tesTask.Logs.Last().GetOrAddExecutorLog(index); + + Assert.IsNotNull(log); + Assert.AreSame(log, tesTask.Logs.Last().Logs[index]); + } + [TestMethod] public async Task LocalPoolCacheAccessesNewPoolsAfterAllPoolsRemovedWithSameKey() { diff --git a/src/TesApi.Web/BatchScheduler.cs b/src/TesApi.Web/BatchScheduler.cs index 0c6e991f8..d02c2dfe2 100644 --- a/src/TesApi.Web/BatchScheduler.cs +++ b/src/TesApi.Web/BatchScheduler.cs @@ -233,9 +233,9 @@ async Task 
SetTaskStateAndLogAsync(TesTask tesTask, TesState newTaskState, tesTaskLog.EndTime ??= batchInfo.BatchTaskEndTime ?? taskEndTime; tesTaskLog.StartTime ??= batchInfo.BatchTaskStartTime ?? taskStartTime; - if (batchInfo.ExecutorIndex is not null || batchInfo.ExecutorEndTime is not null || batchInfo.ExecutorStartTime is not null || batchInfo.ExecutorExitCode is not null) + if (batchInfo.ExecutorIndex is not null && (batchInfo.ExecutorEndTime is not null || batchInfo.ExecutorStartTime is not null || batchInfo.ExecutorExitCode is not null)) { - var tesTaskExecutorLog = tesTaskLog.GetOrAddExecutorLog(batchInfo.ExecutorIndex); + var tesTaskExecutorLog = tesTaskLog.GetOrAddExecutorLog(batchInfo.ExecutorIndex.Value); tesTaskExecutorLog.StartTime ??= batchInfo.ExecutorStartTime; tesTaskExecutorLog.EndTime ??= batchInfo.ExecutorEndTime; tesTaskExecutorLog.ExitCode ??= batchInfo.ExecutorExitCode; @@ -247,13 +247,6 @@ async Task SetTaskStateAndLogAsync(TesTask tesTask, TesState newTaskState, .Concat(DeserializeJsonStringArray(tesTaskExecutorLog.Stdout) ?? [])); } } - else if (!tesTask.IsActiveState(/*TODO*/) && (tesTaskLog.Logs?.Any(log => log.ExitCode is null || log.EndTime is null) ?? true)) - { - var tesTaskExecutorLog = tesTaskLog.GetOrAddExecutorLog(); - tesTaskExecutorLog.StartTime ??= batchInfo.BatchTaskStartTime ?? taskStartTime ?? tesTaskLog.StartTime; - tesTaskExecutorLog.EndTime ??= batchInfo.BatchTaskEndTime ?? taskEndTime ?? 
tesTaskLog.EndTime; - tesTaskExecutorLog.ExitCode ??= batchInfo.BatchTaskExitCode; - } if (batchInfo.OutputFileLogs is not null) { diff --git a/src/TesApi.Web/Events/RunnerEventsProcessor.cs b/src/TesApi.Web/Events/RunnerEventsProcessor.cs index 45b06171d..5f6129edd 100644 --- a/src/TesApi.Web/Events/RunnerEventsProcessor.cs +++ b/src/TesApi.Web/Events/RunnerEventsProcessor.cs @@ -252,7 +252,7 @@ static int ThenByName(RunnerEventsMessage message) : message.RunnerEventMessage.Name); static int ThenByIndex(RunnerEventsMessage message) - => int.Parse(message.RunnerEventMessage?.EventData.FirstOrDefault(p => "executor".Equals(p.Key)).Value ?? "0"); + => message.RunnerEventMessage?.EventData is null ? 0 : ParseExecutorIndex(message.RunnerEventMessage?.EventData) ?? 0; static int ParseEventName(string eventName) => EventsInOrder.TryGetValue(eventName, out var result) ? result : 0; @@ -264,6 +264,13 @@ public TOrder OrderBy(TItem item) => adapter(messageGetter(item)); } + static int? ParseExecutorIndex(IDictionary eventData) + // Maintain format with TesApi.Web.Events.RunnerEventsProcessor.ExecutorFormatted() + => (eventData?.TryGetValue("executor", out var value) ?? false) && + int.TryParse(value.Split('/', 2, StringSplitOptions.TrimEntries)[0], out var result) + ? result - 1 + : default; + /// /// Gets the task status details from this event message. /// @@ -369,13 +376,6 @@ public async Task GetMessageBatchStateAsync(RunnerEventsMes }; // Helpers - static int? ParseExecutorIndex(IDictionary eventData) - // Maintain format with TesApi.Web.Events.RunnerEventsProcessor.ExecutorFormatted() - => (eventData?.TryGetValue("executor", out var value) ?? false) && - int.TryParse(value.Split('/', 2, StringSplitOptions.TrimEntries)[0], out var result) - ? 
result - : default; - static IEnumerable GetOutputFileLogs(IDictionary eventData) { if (eventData is null || !eventData.TryGetValue("fileLog-Count", out var fileCount)) @@ -449,7 +449,7 @@ IAsyncEnumerable GetProcessLogsAsync(Tes.Runner.Events.EventMessage mess async ValueTask AddProcessLogsAsync(Tes.Models.TesTask tesTask, int index, CancellationToken cancellationToken) { - if (index < 0) + if (int.IsNegative(index)) { return; } diff --git a/src/TesApi.Web/IBatchScheduler.cs b/src/TesApi.Web/IBatchScheduler.cs index 923b9adc1..bdbce43f8 100644 --- a/src/TesApi.Web/IBatchScheduler.cs +++ b/src/TesApi.Web/IBatchScheduler.cs @@ -172,9 +172,9 @@ internal readonly bool Validate(bool validateEventsVersion) return validateEventsVersion ? !NormalizeForValidation(Events.RunnerEventsMessage.EventsVersion).OrderBy(pair => pair.Key, StringComparer.OrdinalIgnoreCase) - .SequenceEqual(NormalizeForValidation(EventsVersion).OrderBy(pair => pair.Key, StringComparer.OrdinalIgnoreCase)) + .SequenceEqual(NormalizeForValidation(EventsVersion ?? new Dictionary()).OrderBy(pair => pair.Key, StringComparer.OrdinalIgnoreCase)) : Events.RunnerEventsMessage.EventsVersion.Keys.Order(StringComparer.OrdinalIgnoreCase) - .SequenceEqual(EventsVersion.Keys.Order(StringComparer.OrdinalIgnoreCase), StringComparer.OrdinalIgnoreCase); + .SequenceEqual((EventsVersion ?? 
new Dictionary()).Keys.Order(StringComparer.OrdinalIgnoreCase), StringComparer.OrdinalIgnoreCase); } static Dictionary NormalizeForValidation(IDictionary value) diff --git a/src/TesApi.Web/TesApi.Web.csproj b/src/TesApi.Web/TesApi.Web.csproj index 54b2f4f5a..d43d915d4 100644 --- a/src/TesApi.Web/TesApi.Web.csproj +++ b/src/TesApi.Web/TesApi.Web.csproj @@ -5,6 +5,7 @@ net8.0 true true + true tesapi false GA4GH Task Execution Service From a20be53283a32e4c4ebac73732232ce3b303ca23 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Tue, 18 Feb 2025 10:33:01 -0800 Subject: [PATCH 197/202] fix process log name gathering --- src/TesApi.Web/Events/RunnerEventsProcessor.cs | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/TesApi.Web/Events/RunnerEventsProcessor.cs b/src/TesApi.Web/Events/RunnerEventsProcessor.cs index a64f538d6..266de72c9 100644 --- a/src/TesApi.Web/Events/RunnerEventsProcessor.cs +++ b/src/TesApi.Web/Events/RunnerEventsProcessor.cs @@ -379,7 +379,7 @@ public async Task GetMessageBatchStateAsync(RunnerEventsMes async ValueTask> AddProcessLogsIfAvailableAsync(Tes.Runner.Events.EventMessage message, Tes.Models.TesTask tesTask, CancellationToken cancellationToken) { - var processLogs = await GetProcessLogsAsync(message.Name, tesTask, cancellationToken).ToListAsync(cancellationToken); + var processLogs = await GetProcessLogsAsync(message, tesTask, cancellationToken).ToListAsync(cancellationToken); if (processLogs.Any()) { @@ -389,16 +389,25 @@ async ValueTask> AddProcessLogsIfAvailableAsync(Tes.Runner.E return processLogs; } - IAsyncEnumerable GetProcessLogsAsync(string messageName, Tes.Models.TesTask tesTask, CancellationToken cancellationToken) + IAsyncEnumerable GetProcessLogsAsync(Tes.Runner.Events.EventMessage message, Tes.Models.TesTask tesTask, CancellationToken cancellationToken) { - var blobNameStartsWith = messageName switch + var blobNameStartsWith = message.Name switch { 
Tes.Runner.Events.EventsPublisher.DownloadEndEvent => "download_std", - Tes.Runner.Events.EventsPublisher.ExecutorEndEvent => "exec_std", + Tes.Runner.Events.EventsPublisher.ExecutorEndEvent => "exec-", Tes.Runner.Events.EventsPublisher.UploadEndEvent => "upload_std", _ => string.Empty, }; + if (message.EventData.TryGetValue("executor", out var value) && + int.TryParse(value.Split('/', 2, StringSplitOptions.TrimEntries)[0], out var executor) + ? executor + : -1) + { + // Maintain format with Tes.RunnerCLI.Commands.CommandLauncher.LaunchesExecutorCommandAsSubProcessAsync() + blobNameStartsWith += $"{executor:D3}_std"; + } + return GetAvailableProcessLogsAsync(blobNameStartsWith, tesTask, cancellationToken).Select(t => t.Uri.AbsoluteUri); } From f655ce6c16214033a6d47155a4d62dc128731a90 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Tue, 18 Feb 2025 10:27:47 -0800 Subject: [PATCH 198/202] Use GITHUB_TOKEN to limit GitHub API rate-limiting (#834) --- .../deploy-tes-on-azure.csproj | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/deploy-tes-on-azure/deploy-tes-on-azure.csproj b/src/deploy-tes-on-azure/deploy-tes-on-azure.csproj index d3cc84be8..bd6897478 100644 --- a/src/deploy-tes-on-azure/deploy-tes-on-azure.csproj +++ b/src/deploy-tes-on-azure/deploy-tes-on-azure.csproj @@ -66,4 +66,19 @@ + + + + $(AppHostIntermediatePath) + + + + <__ItemsToRemove Remove="@(__ItemsToRemove)" /> + <__ItemsToRemove Condition=" '%(Filename)' == '$(_DotNetSingleFileHostExecutableNameWithoutExtension)' And '%(FullPath)' != '$(SingleFileHostIntermediatePath)' " + Include="@(_SourceItemsToCopyToPublishDirectoryAlways)" /> + <_SourceItemsToCopyToPublishDirectoryAlways Remove="@(__ItemsToRemove)" /> + <__ItemsToRemove Remove="@(__ItemsToRemove)" /> + + + From 34109e8e7cc96f6255e9a763ca8b1247c6ce10af Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Tue, 18 Feb 2025 10:29:05 -0800 Subject: [PATCH 199/202] Update AppArmor configuration (#836) --- 
.../scripts/helm/templates/tes-deployment.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/deploy-tes-on-azure/scripts/helm/templates/tes-deployment.yaml b/src/deploy-tes-on-azure/scripts/helm/templates/tes-deployment.yaml index 7c602f306..72289c685 100644 --- a/src/deploy-tes-on-azure/scripts/helm/templates/tes-deployment.yaml +++ b/src/deploy-tes-on-azure/scripts/helm/templates/tes-deployment.yaml @@ -16,8 +16,6 @@ spec: labels: io.kompose.service: tes azure.workload.identity/use: "true" - annotations: - container.apparmor.security.beta.kubernetes.io/tes: runtime/default spec: serviceAccountName: {{ .Values.identity.name }}-sa securityContext: @@ -25,6 +23,8 @@ spec: runAsGroup: 1000 fsGroup: 1000 supplementalGroups: [1000] + appArmorProfile: + type: RuntimeDefault terminationGracePeriodSeconds: 90 containers: - env: From 1e9453467d7494b128c3ff3fae842445af98b254 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Tue, 25 Feb 2025 12:19:24 -0800 Subject: [PATCH 200/202] Wait for storage account role propagation (#838) --- src/deploy-tes-on-azure/Deployer.cs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/deploy-tes-on-azure/Deployer.cs b/src/deploy-tes-on-azure/Deployer.cs index cde6f7f56..4bd757bfe 100644 --- a/src/deploy-tes-on-azure/Deployer.cs +++ b/src/deploy-tes-on-azure/Deployer.cs @@ -629,12 +629,16 @@ await Task.WhenAll( { ConsoleEx.WriteLine("Unable to assign 'Storage Blob Data Contributor' for deployment identity to the storage account. 
If the deployment fails as a result, the storage account must be precreated and the deploying user must have the 'Storage Blob Data Contributor' role for the storage account.", ConsoleColor.Yellow); } + else + { + await Task.Delay(TimeSpan.FromMinutes(5), cts.Token); + } - await WritePersonalizedFilesToStorageAccountAsync(storageAccountData); await AssignVmAsContributorToStorageAccountAsync(managedIdentity, storageAccount); await AssignVmAsDataOwnerToStorageAccountAsync(managedIdentity, storageAccount); await AssignManagedIdOperatorToResourceAsync(managedIdentity, resourceGroup); await AssignMIAsNetworkContributorToResourceAsync(managedIdentity, resourceGroup); + await WritePersonalizedFilesToStorageAccountAsync(storageAccountData); }), ]); From acf6fd3ef335cd51e3b2daace7cfcffd248ba38f Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Tue, 25 Feb 2025 12:58:05 -0800 Subject: [PATCH 201/202] Announce v6.0.0 deployment changes (#839) --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index b82c94ee9..79e5b03fb 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +### Announcement: Deployment [changes](https://github.com/microsoft/ga4gh-tes/wiki/6.0.0-Migration-Guide) in v6.0.0 + # GA4GH TES on Azure This project is an implementation of the [GA4GH Task Execution Service (TES)](https://github.com/ga4gh/task-execution-schemas), that provides distributed batch task execution on Microsoft Azure. The TES API is an effort to define a standardized schema and API for describing batch execution tasks. A task defines a set of input files, a set of Docker containers and commands to run, a set of output files, and some other logging and metadata. 
From 38a75ad06782631b6fe95b93090a4e8a78767a71 Mon Sep 17 00:00:00 2001 From: Blair L Murri Date: Thu, 27 Feb 2025 08:08:30 -0800 Subject: [PATCH 202/202] Fix bad git merge --- src/TesApi.Web/Events/RunnerEventsProcessor.cs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/TesApi.Web/Events/RunnerEventsProcessor.cs b/src/TesApi.Web/Events/RunnerEventsProcessor.cs index 266de72c9..2328c05d3 100644 --- a/src/TesApi.Web/Events/RunnerEventsProcessor.cs +++ b/src/TesApi.Web/Events/RunnerEventsProcessor.cs @@ -400,9 +400,7 @@ IAsyncEnumerable GetProcessLogsAsync(Tes.Runner.Events.EventMessage mess }; if (message.EventData.TryGetValue("executor", out var value) && - int.TryParse(value.Split('/', 2, StringSplitOptions.TrimEntries)[0], out var executor) - ? executor - : -1) + int.TryParse(value.Split('/', 2, StringSplitOptions.TrimEntries)[0], out var executor)) { // Maintain format with Tes.RunnerCLI.Commands.CommandLauncher.LaunchesExecutorCommandAsSubProcessAsync() blobNameStartsWith += $"{executor:D3}_std";