Skip to content

Commit ef08790

Browse files
authored
IA-4720 Calculate spark.driver.memory (#4007)
``` terra-quality-c73fdcc7/automation-test-a0fs2ov5z errored due to List(RuntimeError(Jupyter, Welder failed to start after 10 minutes.,None,2023-12-06T21:11:49Z,Some(TraceId(eafd22dba736444681b48f1541d714f5/3939911508519804487)))) ``` One test failure, and this PR doesn't affect GCE VMs, so going to merge as is.
1 parent dbdda15 commit ef08790

File tree

5 files changed

+90
-8
lines changed

5 files changed

+90
-8
lines changed

core/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/runtimeModels.scala

+7-1
Original file line numberDiff line numberDiff line change
@@ -542,8 +542,14 @@ object MemorySize {
542542
* Resource constraints for a runtime.
543543
* See https://docs.docker.com/compose/compose-file/compose-file-v2/#cpu-and-other-resources
544544
* for other types of resources we may want to add here.
545+
*
546+
* driverMemory will be populated when the runtime is Dataproc and the machine type is n1-standard-{x} or n1-highmem-{x}; in that case a different algorithm is used
547+
* for calculating spark:spark.driver.memory
545548
*/
546-
final case class RuntimeResourceConstraints(memoryLimit: MemorySize, totalMachineMemory: MemorySize)
549+
final case class RuntimeResourceConstraints(memoryLimit: MemorySize,
550+
totalMachineMemory: MemorySize,
551+
driverMemory: Option[MemorySize]
552+
)
547553

548554
final case class RuntimeMetrics(cloudContext: CloudContext,
549555
runtimeName: RuntimeName,

http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/DataprocInterpreter.scala

+15-3
Original file line numberDiff line numberDiff line change
@@ -786,13 +786,21 @@ class DataprocInterpreter[F[_]: Parallel](
786786
val sparkMemoryConfigRatio = config.dataprocConfig.sparkMemoryConfigRatio.getOrElse(0.8)
787787
// We still want a minimum to run Jupyter and other system processes.
788788
val minRuntimeMemoryGb = config.dataprocConfig.minimumRuntimeMemoryInGb.getOrElse(4.0)
789+
// Note this algorithm is recommended by Hail team. See more info in https://broadworkbench.atlassian.net/browse/IA-4720
790+
val sparkDriverMemory = machineType match {
791+
case MachineTypeName(n1standard) if n1standard.startsWith("n1-standard") =>
792+
Some(MemorySize.fromGb((total.bytes / MemorySize.gbInBytes - 7) * 0.9))
793+
case MachineTypeName(n1highmem) if n1highmem.startsWith("n1-highmem") =>
794+
Some(MemorySize.fromGb((total.bytes / MemorySize.gbInBytes - 11) * 0.9))
795+
case _ => none[MemorySize]
796+
}
789797
val runtimeAllocatedMemory =
790798
Math.max(
791799
(total.bytes * (1 - sparkMemoryConfigRatio)).toLong,
792800
MemorySize.fromGb(minRuntimeMemoryGb).bytes
793801
)
794802

795-
RuntimeResourceConstraints(MemorySize(runtimeAllocatedMemory), MemorySize(total.bytes))
803+
RuntimeResourceConstraints(MemorySize(runtimeAllocatedMemory), MemorySize(total.bytes), sparkDriverMemory)
796804
}
797805

798806
/**
@@ -887,8 +895,12 @@ class DataprocInterpreter[F[_]: Parallel](
887895
Map("dataproc:dataproc.allow.zero.workers" -> "true")
888896
} else Map.empty[String, String]
889897

890-
val memoryLimitInMb =
891-
(jupyterResourceConstraints.totalMachineMemory.bytes - jupyterResourceConstraints.memoryLimit.bytes) / MemorySize.mbInBytes
898+
val memoryLimitInMb = jupyterResourceConstraints.driverMemory match {
899+
case Some(value) => value.bytes / MemorySize.mbInBytes
900+
case None =>
901+
// We use a different algorithm to calculate spark.driver.memory when the machine type is neither n1-standard nor n1-highmem
902+
(jupyterResourceConstraints.totalMachineMemory.bytes - jupyterResourceConstraints.memoryLimit.bytes) / MemorySize.mbInBytes
903+
}
892904
val driverMemoryProp = Map("spark:spark.driver.memory" -> s"${memoryLimitInMb}m")
893905

894906
val yarnProps = Map(

http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GceInterpreter.scala

+1-1
Original file line numberDiff line numberDiff line change
@@ -544,7 +544,7 @@ class GceInterpreter[F[_]](
544544
gceAllocated = config.gceConfig.gceReservedMemory.map(_.bytes).getOrElse(0L)
545545
welderAllocated = config.welderConfig.welderReservedMemory.map(_.bytes).getOrElse(0L)
546546
result = MemorySize(total.bytes - gceAllocated - welderAllocated)
547-
} yield RuntimeResourceConstraints(result, total)
547+
} yield RuntimeResourceConstraints(result, total, None)
548548

549549
private def buildNetworkInterfaces(runtimeProjectAndName: RuntimeProjectAndName,
550550
subnetwork: SubnetworkName,

http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/CommonTestData.scala

+1-1
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,7 @@ object CommonTestData {
239239
val cryptoDetectorImage =
240240
RuntimeImage(CryptoDetector, "crypto/crypto:0.0.1", None, Instant.now.truncatedTo(ChronoUnit.MICROS))
241241

242-
val clusterResourceConstraints = RuntimeResourceConstraints(MemorySize.fromMb(3584), MemorySize.fromMb(7680))
242+
val clusterResourceConstraints = RuntimeResourceConstraints(MemorySize.fromMb(3584), MemorySize.fromMb(7680), None)
243243
val hostToIpMapping = Ref.unsafe[IO, Map[String, IP]](Map.empty)
244244

245245
def makeAsyncRuntimeFields(index: Int): AsyncRuntimeFields =

http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/util/DataprocInterpreterSpec.scala

+66-2
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ import java.time.Instant
4646
import scala.concurrent.ExecutionContext.Implicits.global
4747
import scala.concurrent.Future
4848
import scala.concurrent.duration._
49+
import org.scalatest.prop.TableDrivenPropertyChecks.Table
50+
import org.scalatest.prop.TableDrivenPropertyChecks.forAll
4951

5052
class DataprocInterpreterSpec
5153
extends TestKit(ActorSystem("leonardotest"))
@@ -214,7 +216,7 @@ class DataprocInterpreterSpec
214216
)
215217

216218
val runtimeConfig = RuntimeConfig.DataprocConfig(0,
217-
MachineTypeName("n1-standard-4"),
219+
MachineTypeName("n2-standard-4"),
218220
DiskSize(500),
219221
None,
220222
None,
@@ -251,9 +253,71 @@ class DataprocInterpreterSpec
251253

252254
}
253255

256+
it should "calculate cluster resource constraints and software config correctly for n1-standard and n1-highmem machine types" in isolatedDbTest {
257+
val highMemGoogleComputeService = new FakeGoogleComputeService {
258+
override def getMachineType(project: GoogleProject, zone: ZoneName, machineTypeName: MachineTypeName)(implicit
259+
ev: Ask[IO, TraceId]
260+
): IO[Option[MachineType]] =
261+
IO.pure(Some(MachineType.newBuilder().setName("pass").setMemoryMb(104 * 1024).setGuestCpus(4).build()))
262+
}
263+
264+
def dataprocInterpHighMem(computeService: GoogleComputeService[IO] = highMemGoogleComputeService,
265+
dataprocCluster: GoogleDataprocService[IO] = MockGoogleDataprocService,
266+
googleDirectoryDao: GoogleDirectoryDAO = mockGoogleDirectoryDAO
267+
) =
268+
new DataprocInterpreter[IO](
269+
Config.dataprocInterpreterConfig,
270+
bucketHelper,
271+
vpcInterp,
272+
dataprocCluster,
273+
computeService,
274+
MockGoogleDiskService,
275+
googleDirectoryDao,
276+
mockGoogleIamDAO,
277+
mockGoogleResourceService,
278+
MockWelderDAO
279+
)
280+
281+
val machineTypes = Table("machineType", MachineTypeName("n1-standard-4"), MachineTypeName("n1-highmem-64"))
282+
forAll(machineTypes) { machineType: MachineTypeName =>
283+
val runtimeConfig = RuntimeConfig.DataprocConfig(0,
284+
machineType,
285+
DiskSize(500),
286+
None,
287+
None,
288+
None,
289+
None,
290+
Map.empty[String, String],
291+
RegionName("us-central1"),
292+
true,
293+
false
294+
)
295+
val resourceConstraints = dataprocInterpHighMem()
296+
.getDataprocRuntimeResourceContraints(testClusterClusterProjectAndName,
297+
runtimeConfig.machineType,
298+
RegionName("us-central1")
299+
)
300+
.unsafeRunSync()(cats.effect.unsafe.IORuntime.global)
301+
302+
val dataProcSoftwareConfig = dataprocInterp().getSoftwareConfig(
303+
GoogleProject("MyGoogleProject"),
304+
RuntimeName("MyRuntimeName"),
305+
runtimeConfig,
306+
resourceConstraints
307+
)
308+
309+
val propertyMap = dataProcSoftwareConfig.getPropertiesMap()
310+
val expectedMemory =
311+
if (machineType == MachineTypeName("n1-standard-4")) (104 - 7) * 0.9 * 1024 else (104 - 11) * 0.9 * 1024
312+
propertyMap.get(
313+
"spark:spark.driver.memory"
314+
) shouldBe s"${expectedMemory.toInt}m"
315+
}
316+
}
317+
254318
it should "create correct softwareConfig - minimum runtime memory 4gb" in isolatedDbTest {
255319
val runtimeConfig = RuntimeConfig.DataprocConfig(0,
256-
MachineTypeName("n1-highmem-64"),
320+
MachineTypeName("n2-highmem-64"),
257321
DiskSize(500),
258322
None,
259323
None,

0 commit comments

Comments
 (0)