diff --git a/.gitignore b/.gitignore
index 6f19958..548f130 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,4 +24,5 @@ build/
 *.tfstate*
 *.lock.hcl
 .oci
+.aws
 .terraform
diff --git a/providers/aws/eks.tf b/providers/aws/eks.tf
new file mode 100644
index 0000000..9cd2eb7
--- /dev/null
+++ b/providers/aws/eks.tf
@@ -0,0 +1,148 @@
+data "aws_availability_zones" "available" { state = "available" }
+data "aws_region" "current" {}
+
+# EKS cluster plus one managed node group per entry in var.node_groups.
+module "eks" {
+  source  = "terraform-aws-modules/eks/aws"
+  version = "~> 20.31"
+
+  cluster_name                    = local.name_prefix
+  cluster_version                 = "1.32"
+  cluster_endpoint_private_access = true
+  cluster_endpoint_public_access  = true
+  enable_irsa                     = true
+
+  vpc_id     = module.vpc.vpc_id
+  subnet_ids = module.vpc.private_subnets
+
+  eks_managed_node_groups = {
+    for k, v in var.node_groups : k => {
+      # Groups start at their minimum; cluster-autoscaler scales them up.
+      desired_size = v.min_size
+      max_size     = v.max_size
+      min_size     = v.min_size
+
+      # NOTE(review): AL2 GPU AMIs are not published past EKS 1.32 — confirm before bumping cluster_version.
+      ami_type = v.gpu_count == 0 ? null : "AL2_x86_64_GPU"
+      block_device_mappings = {
+        xvda = {
+          device_name = "/dev/xvda"
+          ebs = {
+            volume_size = v.root_disk_size_gb
+          }
+        }
+      }
+
+      capacity_type  = v.spot ? "SPOT" : "ON_DEMAND"
+      instance_types = [v.instance_type]
+      labels = merge(
+        v.gpu_count == 0 ? {} : {
+          "k8s.amazonaws.com/accelerator" = v.gpu_accelerator
+        },
+        v.dedicated_node_role == null ? {} : {
+          "flyte.org/node-role" = v.dedicated_node_role
+        }
+      )
+
+      subnet_ids = module.vpc.private_subnets
+      # Auto-discovery tags for the cluster-autoscaler deployed below.
+      tags = {
+        "k8s.io/cluster-autoscaler/enabled"              = true
+        "k8s.io/cluster-autoscaler/${local.name_prefix}" = true
+      }
+
+      # GPU nodes are tainted so only GPU-tolerant pods land on them.
+      taints = v.gpu_count == 0 ? [] : [
+        {
+          key    = "nvidia.com/gpu"
+          value  = "present"
+          effect = "NO_SCHEDULE"
+        }
+      ]
+
+      iam_role_additional_policies = {
+        "CloudWatchAgentPolicy" = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy"
+      }
+    }
+  }
+}
+
+# Propagate the node-template tags from locals.tf onto each node group's ASG
+# (scale-from-zero hints for the cluster-autoscaler). Tags are not propagated
+# at launch because the kubelet applies the real labels/taints itself.
+resource "aws_autoscaling_group_tag" "eks_managed_node_group_asg_tag" {
+  for_each = merge([
+    for mng, tags in local.nodegroup_asg_tags : {
+      for tag_key, tag_value in tags : "${mng}-${replace(tag_key, "k8s.io/cluster-autoscaler/node-template/", "")}" => {
+        mng   = mng
+        key   = tag_key
+        value = tag_value
+      }
+    }
+  ]...)
+
+  autoscaling_group_name = one(module.eks.eks_managed_node_groups[each.value.mng].node_group_autoscaling_group_names)
+
+  tag {
+    key                 = each.value.key
+    value               = each.value.value
+    propagate_at_launch = false
+  }
+
+  depends_on = [module.eks]
+}
+
+data "aws_eks_cluster_auth" "default" {
+  name = module.eks.cluster_name
+}
+
+module "aws_load_balancer_controller_irsa_role" {
+  source  = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
+  version = "5.11.2"
+
+  role_name = "${local.name_prefix}-aws-load-balancer-controller"
+  # NOTE(review): role is created with NO attached policy — confirm the LB-controller
+  # policy is attached elsewhere, otherwise this should likely be true.
+  attach_load_balancer_controller_policy = false
+
+  oidc_providers = {
+    ex = {
+      provider_arn               = module.eks.oidc_provider_arn
+      namespace_service_accounts = ["kube-system:aws-load-balancer-controller"]
+    }
+  }
+}
+
+module "cluster_autoscaler_irsa_role" {
+  source  = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
+  version = "5.11.2"
+
+  role_name                        = "${local.name_prefix}-cluster-autoscaler"
+  attach_cluster_autoscaler_policy = true
+  cluster_autoscaler_cluster_ids   = [module.eks.cluster_name]
+
+  oidc_providers = {
+    default = {
+      provider_arn               = module.eks.oidc_provider_arn
+      namespace_service_accounts = ["kube-system:aws-cluster-autoscaler"]
+    }
+  }
+}
+
+resource "helm_release" "aws_cluster_autoscaler" {
+  namespace = "kube-system"
+  wait      = true
+  timeout   = 600
+
+  name = "aws-cluster-autoscaler"
+
+  repository = "https://kubernetes.github.io/autoscaler"
+
chart      = "cluster-autoscaler"
+  # NOTE(review): chart 9.24.0 pins cluster-autoscaler ~v1.26, which is skewed
+  # against cluster_version 1.32 — confirm and bump to a matching chart release.
+  version    = "9.24.0"
+
+  set {
+    name  = "autoDiscovery.clusterName"
+    value = module.eks.cluster_name
+  }
+
+  set {
+    name  = "awsRegion"
+    value = data.aws_region.current.name
+  }
+
+  # Bind the service account to the IRSA role created above.
+  set {
+    name  = "rbac.serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn"
+    value = module.cluster_autoscaler_irsa_role.iam_role_arn
+  }
+  depends_on = [module.eks]
+}
diff --git a/providers/aws/eks_auth.tf b/providers/aws/eks_auth.tf
new file mode 100644
index 0000000..87c28d0
--- /dev/null
+++ b/providers/aws/eks_auth.tf
@@ -0,0 +1,62 @@
+# Resolve admin roles given as regexes into concrete role ARNs.
+data "aws_iam_roles" "admin_regexes" {
+  for_each   = toset(var.admin_role_regexes)
+  name_regex = each.key
+}
+
+locals {
+  admin_regex_role_arns = [
+    for role in data.aws_iam_roles.admin_regexes : one(role.arns)
+  ]
+
+  # aws-auth matches roles without their IAM path: keep "arn:...:role" + final name segment.
+  pathless_admin_regex_role_arns = [
+    for parts in [for arn in local.admin_regex_role_arns : split("/", arn)] :
+    format("%s/%s", parts[0], element(parts, length(parts) - 1))
+  ]
+
+  admin_role_arns = concat(
+    local.pathless_admin_regex_role_arns,
+    var.admin_role_arns,
+  )
+
+  admin_role_configmap_data = [
+    for role_arn in local.admin_role_arns : {
+      rolearn  = role_arn
+      username = "union-admin"
+      groups   = ["system:masters"]
+    }
+  ]
+
+  node_role_configmap_data = [
+    for role_arn in var.node_role_arns : {
+      rolearn  = role_arn
+      username = "system:node:{{EC2PrivateDNSName}}"
+      groups   = ["system:bootstrappers", "system:nodes"]
+    }
+  ]
+
+  admin_user_arns = [
+    for role_arn in var.admin_user_arns : {
+      userarn = role_arn
+      groups  = ["system:masters"]
+    }
+  ]
+
+  # aws-auth expects unquoted YAML scalars; strip the quotes yamlencode adds.
+  aws_auth_configmap_data = {
+    mapRoles = replace(yamlencode(concat(
+      local.admin_role_configmap_data,
+      local.node_role_configmap_data,
+    )), "\"", "")
+    mapUsers = replace(yamlencode(local.admin_user_arns), "\"", "")
+  }
+}
+
+# Patch the EKS-managed aws-auth ConfigMap rather than recreating it.
+resource "kubernetes_config_map_v1_data" "aws_auth" {
+  force = true
+
+  metadata {
+    name      = "aws-auth"
+    namespace = "kube-system"
+  }
+
+  data = local.aws_auth_configmap_data
+}
diff --git a/providers/aws/iam.tf
b/providers/aws/iam.tf
new file mode 100644
index 0000000..b47308c
--- /dev/null
+++ b/providers/aws/iam.tf
@@ -0,0 +1,84 @@
+locals {
+  # NOTE(review): unused in this file; the backend IRSA module below hard-codes
+  # its service accounts instead — verify one of the two is the source of truth.
+  union_backend_ksas = ["flytepropeller"]
+  union_ksas         = ["default"] # The KSA that Task Pods will use
+
+  # "<project>-<domain>:<ksa>" for every project/domain/KSA combination.
+  union_worker_wi_members = toset([
+    for tpl in setproduct(
+      local.union_projects,
+      local.union_domains,
+      local.union_ksas
+    ) : format("%s-%s:%s", tpl...)
+  ])
+}
+
+# Read/write/list/delete on the data bucket and its objects.
+data "aws_iam_policy_document" "union_data_bucket_policy" {
+  statement {
+    sid    = ""
+    effect = "Allow"
+    actions = [
+      "s3:DeleteObject*",
+      "s3:GetObject*",
+      "s3:ListBucket",
+      "s3:PutObject*"
+    ]
+    resources = [
+      "arn:aws:s3:::${module.union-data.s3_bucket_id}",
+      "arn:aws:s3:::${module.union-data.s3_bucket_id}/*"
+    ]
+  }
+}
+
+data "aws_iam_policy_document" "union_backend_iam_policy" {
+  source_policy_documents = compact([
+    data.aws_iam_policy_document.union_data_bucket_policy.json
+  ])
+}
+
+resource "aws_iam_policy" "union_backend_iam_policy" {
+  name   = "${local.name_prefix}-flyte-backend-iam-policy"
+  policy = data.aws_iam_policy_document.union_backend_iam_policy.json
+}
+
+# IRSA role assumed by the Flyte control-plane service accounts.
+module "union_backend_irsa_role" {
+  source                     = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
+  version                    = "5.11.2"
+  assume_role_condition_test = "StringEquals"
+  role_name                  = "${local.name_prefix}-backend-role"
+  role_policy_arns = {
+    default = aws_iam_policy.union_backend_iam_policy.arn
+  }
+  oidc_providers = {
+    default = {
+      provider_arn               = module.eks.oidc_provider_arn
+      namespace_service_accounts = ["flyte:flytepropeller", "flyte:flyteadmin", "flyte:datacatalog"]
+    }
+  }
+}
+
+data "aws_iam_policy_document" "union_worker_iam_policy" {
+  source_policy_documents = compact([
+    data.aws_iam_policy_document.union_data_bucket_policy.json
+  ])
+}
+
+resource "aws_iam_policy" "flyte_worker_iam_policy" {
+  name   = "${local.name_prefix}-flyte-worker-iam-policy"
+  policy = data.aws_iam_policy_document.union_worker_iam_policy.json
+}
+
+# IRSA role assumed by Task Pods across every project-domain namespace.
+module "flyte_worker_irsa_role" {
+  source                     = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
+  version                    = "5.11.2"
+  assume_role_condition_test = "StringEquals"
+  role_name                  = "${local.name_prefix}-flyte-worker"
+  role_policy_arns = {
+    default = aws_iam_policy.flyte_worker_iam_policy.arn
+  }
+
+  oidc_providers = {
+    default = {
+      provider_arn               = module.eks.oidc_provider_arn
+      namespace_service_accounts = local.union_worker_wi_members
+    }
+  }
+}
diff --git a/providers/aws/locals.tf b/providers/aws/locals.tf
new file mode 100644
index 0000000..c6d858f
--- /dev/null
+++ b/providers/aws/locals.tf
@@ -0,0 +1,52 @@
+locals {
+  # move to vars
+  project     = "union"
+  environment = "terraform"
+  name_prefix = "${local.project}-${local.environment}"
+  account_id  = data.aws_caller_identity.current.account_id
+
+  union_projects = ["flytesnacks"]
+  union_domains  = ["development", "staging", "production"]
+
+  # move to vars as well
+  azs             = data.aws_availability_zones.available.zone_ids
+  main_cidr_block = "10.0.0.0/16"
+  # One /16 secondary CIDR per AZ for private (node/pod) subnets; /24s inside
+  # the main CIDR for public and database subnets.
+  private_subnets = [
+    for idx, _ in local.azs :
+    format("10.%d.0.0/16", idx + 1)
+  ]
+  public_subnets = [
+    for idx, _ in local.azs :
+    format("10.0.%d.0/24", idx + 1)
+  ]
+  database_subnets = [
+    for idx, _ in local.azs :
+    format("10.0.%d.0/24", idx + 10)
+  ]
+
+  # node-template ASG tags so the cluster-autoscaler can scale groups from zero.
+  nodegroup_asg_tags = {
+    for k, v in var.node_groups : k => merge(
+      # Spot
+      v.spot ? {
+        "k8s.io/cluster-autoscaler/node-template/label/eks.amazonaws.com/capacityType" = "SPOT"
+      } : {
+        "k8s.io/cluster-autoscaler/node-template/label/eks.amazonaws.com/capacityType" = "ON_DEMAND"
+      },
+      # Ephemeral storage
+      {
+        "k8s.io/cluster-autoscaler/node-template/resources/ephemeral-storage" = "${v.root_disk_size_gb}G"
+      },
+      # GPUs
+      v.gpu_count == 0 ?
{} : {
+        "k8s.io/cluster-autoscaler/node-template/label/k8s.amazonaws.com/accelerator" = v.gpu_accelerator
+        "k8s.io/cluster-autoscaler/node-template/resources/nvidia.com/gpu"            = tostring(v.gpu_count)
+        "k8s.io/cluster-autoscaler/node-template/taint/nvidia.com/gpu"                = "present:NoSchedule"
+      },
+      # Dedicated node role
+      v.dedicated_node_role == null ? {} : {
+        "k8s.io/cluster-autoscaler/node-template/label/flyte.org/node-role" = v.dedicated_node_role
+        "k8s.io/cluster-autoscaler/node-template/taint/flyte.org/node-role" = "${v.dedicated_node_role}:NoSchedule"
+      }
+    )
+  }
+}
diff --git a/providers/aws/output.tf b/providers/aws/output.tf
new file mode 100644
index 0000000..e69de29
diff --git a/providers/aws/provider.tf b/providers/aws/provider.tf
new file mode 100644
index 0000000..bad4fe3
--- /dev/null
+++ b/providers/aws/provider.tf
@@ -0,0 +1,48 @@
+data "aws_caller_identity" "current" {}
+
+terraform {
+  required_version = ">= 1.3.0"
+
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = ">= 5.0.0"
+    }
+  }
+}
+
+provider "aws" {
+  region  = var.aws_region
+  profile = var.aws_cli_profile
+}
+
+# Helm authenticates via `aws eks get-token` so credentials never go stale
+# mid-apply (unlike the commented-out static token).
+provider "helm" {
+  kubernetes {
+    host = module.eks.cluster_endpoint
+    # token                = data.aws_eks_cluster_auth.default.token
+    cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
+    exec {
+      api_version = "client.authentication.k8s.io/v1beta1"
+      args        = ["eks", "get-token", "--cluster-name", module.eks.cluster_name]
+      command     = "aws"
+    }
+  }
+}
+
+provider "kubernetes" {
+  host                   = module.eks.cluster_endpoint
+  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
+  token                  = data.aws_eks_cluster_auth.default.token
+  # exec {
+  #   api_version = "client.authentication.k8s.io/v1beta1"
+  #   args        = ["eks", "get-token", "--cluster-name", module.eks.cluster_name]
+  #   command     = "aws"
+  # }
+}
+
+# provider "kubectl" {
+#   host                   = module.eks.cluster_endpoint
+#   cluster_ca_certificate =
base64decode(module.eks.cluster_certificate_authority_data)
+#   token                  = data.aws_eks_cluster_auth.default.token
+#   load_config_file       = false
+# }
\ No newline at end of file
diff --git a/providers/aws/storage.tf b/providers/aws/storage.tf
new file mode 100644
index 0000000..d7ec944
--- /dev/null
+++ b/providers/aws/storage.tf
@@ -0,0 +1,11 @@
+# Private S3 bucket for Flyte metadata and task data.
+module "union-data" {
+  source  = "terraform-aws-modules/s3-bucket/aws"
+  version = "4.6.0"
+
+  # TODO: change me to something different (var)
+  bucket                  = "${local.name_prefix}-data"
+  block_public_acls       = true
+  block_public_policy     = true
+  ignore_public_acls      = true
+  restrict_public_buckets = true
+}
diff --git a/providers/aws/terraform.tfvars b/providers/aws/terraform.tfvars
new file mode 100644
index 0000000..9d79600
--- /dev/null
+++ b/providers/aws/terraform.tfvars
@@ -0,0 +1,5 @@
+# Sample tfvars file. Uncomment values to use.
+# Do not commit this file to Git with sensitive values.
+
+#aws_cli_profile =
+#aws_region =
diff --git a/providers/aws/variables.tf b/providers/aws/variables.tf
new file mode 100644
index 0000000..fbc3cbf
--- /dev/null
+++ b/providers/aws/variables.tf
@@ -0,0 +1,53 @@
+variable "aws_cli_profile" {
+  type = string
+}
+
+variable "aws_region" {
+  type = string
+}
+
+# One entry per EKS managed node group; gpu_count = 0 means a CPU-only group.
+variable "node_groups" {
+  type = map(object({
+    instance_type       = string
+    dedicated_node_role = string
+    min_size            = number
+    max_size            = number
+    root_disk_size_gb   = number
+    spot                = bool
+    gpu_accelerator     = string
+    gpu_count           = number
+  }))
+  default = {
+    worker-on-demand = {
+      instance_type       = "m7i.xlarge"
+      dedicated_node_role = "worker"
+      min_size            = 2
+      max_size            = 5
+      root_disk_size_gb   = 500
+      spot                = false
+      gpu_accelerator     = ""
+      gpu_count           = 0
+    }
+  }
+}
+
+variable "admin_role_arns" {
+  type    = list(string)
+  default = []
+}
+
+variable "admin_user_arns" {
+  type    = list(string)
+  default = []
+}
+
+variable "node_role_arns" {
+  type    = list(string)
+  default = []
+}
+
+variable "admin_role_regexes" {
+  type    = list(string)
+  default = []
+}
diff --git a/providers/aws/vpc.tf b/providers/aws/vpc.tf
new file mode 100644
index 0000000..1b18e15
--- /dev/null
+++ b/providers/aws/vpc.tf
@@ -0,0 +1,29 @@
+module "vpc" {
+  source  = "terraform-aws-modules/vpc/aws"
+  version = "5.19.0"
+
+  name = local.name_prefix
+  # Use the single source of truth from locals.tf (was hard-coded "10.0.0.0/16").
+  cidr = local.main_cidr_block
+  # NOTE(review): the per-AZ /16 private subnets are attached as secondary CIDRs
+  # and then consumed whole as subnets — presumably to give pods large address
+  # space; confirm this is intentional.
+  secondary_cidr_blocks = local.private_subnets
+
+  azs             = local.azs
+  private_subnets = local.private_subnets
+  public_subnets  = local.public_subnets
+
+  enable_nat_gateway = true
+  single_nat_gateway = true
+}
+
+# Gateway endpoint keeps S3 traffic off the (single) NAT gateway.
+module "vpc_endpoints" {
+  source  = "terraform-aws-modules/vpc/aws//modules/vpc-endpoints"
+  version = "5.19.0"
+
+  vpc_id = module.vpc.vpc_id
+  endpoints = {
+    s3 = {
+      service         = "s3"
+      service_type    = "Gateway"
+      route_table_ids = module.vpc.private_route_table_ids
+    }
+  }
+}