From b9d56e891e8eaa5fa462b5cfb7f1284086e6c6fc Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Thu, 7 Dec 2023 17:50:11 +0100 Subject: [PATCH 01/46] first blood --- tf-managed/.gitignore | 34 +++++ tf-managed/README.md | 11 ++ tf-managed/common/ruby_common/docker_utils.rb | 12 ++ tf-managed/common/ruby_common/slack_client.rb | 56 ++++++++ tf-managed/common/ruby_common/utils.rb | 9 ++ tf-managed/live/.gitignore | 15 +++ tf-managed/live/README.md | 1 + .../dev/sync-check/terragrunt.hcl | 15 +++ .../prod/sync-check/terragrunt.hcl | 16 +++ tf-managed/live/terragrunt.hcl | 45 +++++++ tf-managed/modules/sync-check/main.tf | 127 ++++++++++++++++++ tf-managed/modules/sync-check/outputs.tf | 4 + tf-managed/modules/sync-check/prep_sources.sh | 14 ++ .../sync-check/service/Dockerfile-tester | 13 ++ tf-managed/modules/sync-check/service/Gemfile | 7 + .../modules/sync-check/service/Gemfile.lock | 41 ++++++ .../sync-check/service/docker-compose.yml | 118 ++++++++++++++++ .../modules/sync-check/service/forest-env.tpl | 9 ++ .../sync-check/service/health_check.sh | 112 +++++++++++++++ tf-managed/modules/sync-check/service/init.sh | 66 +++++++++ .../sync-check/service/restart.service | 9 ++ .../modules/sync-check/service/restart.sh | 7 + .../modules/sync-check/service/run_service.sh | 49 +++++++ .../modules/sync-check/service/sync_check.rb | 59 ++++++++ .../sync-check/service/sync_check.toml | 4 + .../sync-check/service/sync_check_process.rb | 123 +++++++++++++++++ tf-managed/modules/sync-check/variables.tf | 68 ++++++++++ 27 files changed, 1044 insertions(+) create mode 100644 tf-managed/.gitignore create mode 100644 tf-managed/README.md create mode 100644 tf-managed/common/ruby_common/docker_utils.rb create mode 100644 tf-managed/common/ruby_common/slack_client.rb create mode 100644 tf-managed/common/ruby_common/utils.rb create mode 100644 tf-managed/live/.gitignore create mode 100644 tf-managed/live/README.md create mode 100644 tf-managed/live/environments/dev/sync-check/terragrunt.hcl create mode 100644 tf-managed/live/environments/prod/sync-check/terragrunt.hcl create mode 100644 tf-managed/live/terragrunt.hcl create mode 100644 tf-managed/modules/sync-check/main.tf create mode 100644 tf-managed/modules/sync-check/outputs.tf create mode 100755 tf-managed/modules/sync-check/prep_sources.sh create mode 100644 tf-managed/modules/sync-check/service/Dockerfile-tester create mode 100644 tf-managed/modules/sync-check/service/Gemfile create mode 100644 tf-managed/modules/sync-check/service/Gemfile.lock create mode 100644 tf-managed/modules/sync-check/service/docker-compose.yml create mode 100644 tf-managed/modules/sync-check/service/forest-env.tpl create mode 100755 tf-managed/modules/sync-check/service/health_check.sh create mode 100755 tf-managed/modules/sync-check/service/init.sh create mode 100644 tf-managed/modules/sync-check/service/restart.service create mode 100755 tf-managed/modules/sync-check/service/restart.sh create mode 100755 tf-managed/modules/sync-check/service/run_service.sh create mode 100755 tf-managed/modules/sync-check/service/sync_check.rb create mode 100644 tf-managed/modules/sync-check/service/sync_check.toml create mode 100755 tf-managed/modules/sync-check/service/sync_check_process.rb create mode 100644 tf-managed/modules/sync-check/variables.tf diff --git a/tf-managed/.gitignore b/tf-managed/.gitignore new file mode 100644 index 000000000..9b8a46e69 --- /dev/null +++ b/tf-managed/.gitignore @@ -0,0 +1,34 @@ +# Local .terraform directories +**/.terraform/* + +# .tfstate files 
+*.tfstate +*.tfstate.* + +# Crash log files +crash.log +crash.*.log + +# Exclude all .tfvars files, which are likely to contain sensitive data, such as +# password, private keys, and other secrets. These should not be part of version +# control as they are data points which are potentially sensitive and subject +# to change depending on the environment. +*.tfvars +*.tfvars.json + +# Ignore override files as they are usually used to override resources locally and so +# are not checked in +override.tf +override.tf.json +*_override.tf +*_override.tf.json + +# Include override files you do wish to add to version control using negated pattern +# !example_override.tf + +# Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan +# example: *tfplan* + +# Ignore CLI configuration files +.terraformrc +terraform.rc diff --git a/tf-managed/README.md b/tf-managed/README.md new file mode 100644 index 000000000..316ed2ceb --- /dev/null +++ b/tf-managed/README.md @@ -0,0 +1,11 @@ +# Terraform-managed + +This directory contains services and assets managed via Terraform/Terragrunt. + +# Structure + +``` +├── common <- common code, shared between all modules (TODO maybe move it to modules?) +├── live <- configurations for different environments. +└── modules <- service and resources definitions +``` diff --git a/tf-managed/common/ruby_common/docker_utils.rb b/tf-managed/common/ruby_common/docker_utils.rb new file mode 100644 index 000000000..6eee6a26d --- /dev/null +++ b/tf-managed/common/ruby_common/docker_utils.rb @@ -0,0 +1,12 @@ +# frozen_string_literal: true + +require 'docker' + +# Tools to facilitate interacting with Docker +module DockerUtils + # returns the specified container logs as String + def self.get_container_logs(container_name) + container = Docker::Container.get container_name + container.streaming_logs(stdout: true, stderr: true) + end +end diff --git a/tf-managed/common/ruby_common/slack_client.rb b/tf-managed/common/ruby_common/slack_client.rb new file mode 100644 index 000000000..0165d1b2e --- /dev/null +++ b/tf-managed/common/ruby_common/slack_client.rb @@ -0,0 +1,56 @@ +# frozen_string_literal: true + +require 'slack-ruby-client' + +# Wrapper Slack client class to handle sending messages and uploading logs. +class SlackClient + @last_thread = nil + @channel = nil + @client = nil + + def initialize(channel, token) + raise "Invalid channel name: #{channel}, must start with #" unless channel.start_with? '#' + raise 'Missing token' if token.nil? + + Slack.configure do |config| + config.token = token + end + + @channel = channel + @client = Slack::Web::Client.new + end + + # Posts a new message to configured channel. + def post_message(text) + msg = @client.chat_postMessage(channel: @channel, text: text) + @last_thread = msg[:ts] + end + + # Attaches a comment/reply to the latest posted thread. + def attach_comment(comment) + raise 'Need to create a thread before attaching a comment.' if @last_thread.nil? + + @client.chat_postMessage(channel: @channel, thread_ts: @last_thread, text: comment) + end + + # Attaches files to the last posted thread. + def attach_files(*files) + files.each do |file| + attach_file file + end + end + + # Attaches a file to the latest posted thread. + def attach_file(file) + raise "No such file #{file}" unless File.exist? file + raise 'Need to create a thread before attaching a file.' if @last_thread.nil? 
+
+    @client.files_upload(
+      channels: @channel,
+      file: Faraday::UploadIO.new(file, 'text/plain'),
+      filename: File.basename(file),
+      initial_comment: 'Attached a file.',
+      thread_ts: @last_thread
+    )
+  end
+end
diff --git a/tf-managed/common/ruby_common/utils.rb b/tf-managed/common/ruby_common/utils.rb
new file mode 100644
index 000000000..a8fcce5b2
--- /dev/null
+++ b/tf-managed/common/ruby_common/utils.rb
@@ -0,0 +1,9 @@
+# frozen_string_literal: true
+
+# Retrieves an environment variable, failing if it's not set or empty.
+def get_and_assert_env_variable(name)
+  var = ENV.fetch(name, nil)
+  raise "Please set the #{name} environment variable" if var.nil? || var.empty?
+
+  var
+end
diff --git a/tf-managed/live/.gitignore b/tf-managed/live/.gitignore
new file mode 100644
index 000000000..466339266
--- /dev/null
+++ b/tf-managed/live/.gitignore
@@ -0,0 +1,15 @@
+*.tf
+.*.sw?
+.idea
+terragrunt.iml
+vendor
+.terraform
+.vscode
+*.tfstate
+*.tfstate.backup
+*.out
+.terragrunt-cache
+.bundle
+.ruby-version
+.terraform.lock.hcl
+.DS_Store
diff --git a/tf-managed/live/README.md b/tf-managed/live/README.md
new file mode 100644
index 000000000..8f0005319
--- /dev/null
+++ b/tf-managed/live/README.md
@@ -0,0 +1 @@
+All Terragrunt configurations live here. To edit Terraform files, go to ../modules
diff --git a/tf-managed/live/environments/dev/sync-check/terragrunt.hcl b/tf-managed/live/environments/dev/sync-check/terragrunt.hcl
new file mode 100644
index 000000000..e4ea12461
--- /dev/null
+++ b/tf-managed/live/environments/dev/sync-check/terragrunt.hcl
@@ -0,0 +1,15 @@
+# Automatically find the root terragrunt.hcl and inherit its
+# configuration
+include {
+  path = find_in_parent_folders()
+}
+
+# Load the actual Terraform module
+terraform {
+  source = format("%s/../modules/sync-check", get_parent_terragrunt_dir())
+}
+
+inputs = {
+  name = "hubert-sync-check-dev" # TODO get environment from terragrunt
+  size = "s-4vcpu-16gb-amd"
+}
diff --git a/tf-managed/live/environments/prod/sync-check/terragrunt.hcl b/tf-managed/live/environments/prod/sync-check/terragrunt.hcl
new file mode 100644
index 000000000..0e1a2da61
--- /dev/null
+++ b/tf-managed/live/environments/prod/sync-check/terragrunt.hcl
@@ -0,0 +1,16 @@
+# Automatically find the root terragrunt.hcl and inherit its
+# configuration
+include {
+  path = find_in_parent_folders()
+}
+
+# Load the actual Terraform module
+terraform {
+  source = format("%s/../modules/sync-check", get_parent_terragrunt_dir())
+}
+
+inputs = {
+  # Configure service:
+  name = "hubert-sync-check-prod" # TODO get environment from terragrunt
+  size = "s-4vcpu-16gb-amd" # droplet size
+}
diff --git a/tf-managed/live/terragrunt.hcl b/tf-managed/live/terragrunt.hcl
new file mode 100644
index 000000000..b0f6523aa
--- /dev/null
+++ b/tf-managed/live/terragrunt.hcl
@@ -0,0 +1,45 @@
+locals {
+  # Parse the file path we're in to read the env name: e.g., env
+  # will be "dev" in the dev folder, "stage" in the stage folder,
+  # etc.
+  parsed = regex(".*/environments/(?P<env>.*?)/.*", get_terragrunt_dir())
+  env    = local.parsed.env
+}
+
+# Remote state, separate for each environment
+remote_state {
+  backend = "s3"
+  generate = {
+    path      = "backend.tf"
+    if_exists = "overwrite_terragrunt"
+  }
+  config = {
+    // if the environment is dev, use the dev bucket, otherwise use the prod bucket
+    bucket = (local.env == "prod"
"hubert-bucket-prod" + : "hubert-bucket-dev" + ) + key = "${local.env}-terraform.tfstate" + region = "eu-west-1" + endpoint = "https://fra1.digitaloceanspaces.com" + //endpoints = { + // s3 = "https://fra1.digitaloceanspaces.com" + //} + skip_bucket_versioning = true + skip_bucket_ssencryption = true + skip_bucket_root_access = true + skip_bucket_public_access_blocking = true + skip_bucket_enforced_tls = true + skip_credentials_validation = true + skip_metadata_api_check = true + skip_requesting_account_id = true + skip_s3_checksum = true + skip_region_validation = true + } +} + +# Common inputs for all the services. +inputs = { + common_resources_dir = format("%s/../common", get_parent_terragrunt_dir()) + slack_channel = (local.env == "prod" ? "#forest-notifications" : "#forest-dump") +} diff --git a/tf-managed/modules/sync-check/main.tf b/tf-managed/modules/sync-check/main.tf new file mode 100644 index 000000000..a323f3ea3 --- /dev/null +++ b/tf-managed/modules/sync-check/main.tf @@ -0,0 +1,127 @@ +# This terraform script executes the following steps: +# - Zip the ruby and shell script files (the hash of this zip file is used to +# determine when to re-deploy the service) +# - Boot a new droplet +# - Copy over the zip file +# - Run calibnet and mainnet sync check in the background + +terraform { + required_version = "~> 1.3" + + required_providers { + digitalocean = { + source = "digitalocean/digitalocean" + version = "~> 2.0" + } + external = { + source = "hashicorp/external" + version = "~> 2.1" + } + local = { + source = "hashicorp/local" + version = "~> 2.1" + } + + } +} + +provider "digitalocean" { + token = var.digitalocean_token +} + +// Ugly hack because 'archive_file' cannot mix files and folders. +data "external" "sources_tar" { + program = ["sh", "${path.module}/prep_sources.sh", path.module, var.common_resources_dir] +} + +data "local_file" "sources" { + filename = data.external.sources_tar.result.path +} + +// Note: The init.sh file is also included in the sources.zip such that the hash +// of the archive captures the entire state of the machine. +// This is a workaround, and because of this, we need to suppress the tflint warning here +// for unused declarations related to the 'init.sh' file. +// tflint-ignore: terraform_unused_declarations +data "local_file" "init" { + filename = "${path.module}/service/init.sh" +} + +data "digitalocean_ssh_keys" "keys" { + sort { + key = "name" + direction = "asc" + } +} + +# Set required environment variables +locals { + env_content = templatefile("${path.module}/service/forest-env.tpl", { + FOREST_TARGET_DATA = "/volumes/forest_data", + FOREST_TARGET_SCRIPTS = "/volumes/sync_check", + FOREST_TARGET_RUBY_COMMON = "/volumes/ruby_common", + slack_token = var.slack_token, + slack_channel = var.slack_channel, + NEW_RELIC_API_KEY = var.NEW_RELIC_API_KEY, + NEW_RELIC_ACCOUNT_ID = var.NEW_RELIC_ACCOUNT_ID, + NEW_RELIC_REGION = var.NEW_RELIC_REGION, + forest_tag = "edge" + }) +} + +locals { + init_commands = [ + "tar xf sources.tar", + # Set required environment variables + "echo '${local.env_content}' >> /root/.forest_env", + "echo '. ~/.forest_env' >> .bashrc", + ". 
~/.forest_env", + "nohup sh ./init.sh > init_log.txt &", + "cp ./restart.service /etc/systemd/system/", + "systemctl enable restart.service", + # Exiting without a sleep sometimes kills the script :-/ + "sleep 60s", + ] +} + +resource "digitalocean_droplet" "forest" { + image = var.image + name = var.name + region = var.region + size = var.size + # Re-initialize resource if this hash changes: + user_data = join("-", [data.local_file.sources.content_sha256, sha256(join("", local.init_commands))]) + tags = ["iac"] + ssh_keys = data.digitalocean_ssh_keys.keys.ssh_keys[*].fingerprint + monitoring = true + + graceful_shutdown = false + + connection { + host = self.ipv4_address + user = "root" + type = "ssh" + } + + # Push the sources.tar file to the newly booted droplet + provisioner "file" { + source = data.local_file.sources.filename + destination = "/root/sources.tar" + } + + provisioner "remote-exec" { + inline = local.init_commands + } +} + +data "digitalocean_project" "forest_project" { + name = var.project +} + +# Connect the droplet to the forest project (otherwise it ends up in +# "ChainBridge" which is the default project) +resource "digitalocean_project_resources" "connect_forest_project" { + project = data.digitalocean_project.forest_project.id + resources = [digitalocean_droplet.forest.urn] +} + diff --git a/tf-managed/modules/sync-check/outputs.tf b/tf-managed/modules/sync-check/outputs.tf new file mode 100644 index 000000000..240c103f1 --- /dev/null +++ b/tf-managed/modules/sync-check/outputs.tf @@ -0,0 +1,4 @@ +# This ip address may be used in the future by monitoring software +output "ip" { + value = [digitalocean_droplet.forest.ipv4_address] +} diff --git a/tf-managed/modules/sync-check/prep_sources.sh b/tf-managed/modules/sync-check/prep_sources.sh new file mode 100755 index 000000000..6324e5da4 --- /dev/null +++ b/tf-managed/modules/sync-check/prep_sources.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# Enable strict error handling and command tracing +set -euxo pipefail + +# Copy local source files in a folder together with ruby_common and create a zip archive. 
+ +cd "$1" +cp --archive $2/ruby_common service/ + +rm -f sources.tar +(cd service && tar cf ../sources.tar --sort=name --mtime='UTC 2019-01-01' ./* > /dev/null 2>&1) +rm -fr service/ruby_common +echo "{ \"path\": \"$1/sources.tar\" }" diff --git a/tf-managed/modules/sync-check/service/Dockerfile-tester b/tf-managed/modules/sync-check/service/Dockerfile-tester new file mode 100644 index 000000000..ed8b1aa89 --- /dev/null +++ b/tf-managed/modules/sync-check/service/Dockerfile-tester @@ -0,0 +1,13 @@ +FROM ubuntu:22.04 + +RUN apt-get update && \ + apt-get install --no-install-recommends -y docker ruby make gcc build-essential ruby-dev curl \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /chainsafe + +COPY Gemfile Gemfile.lock health_check.sh sync_check_process.rb sync_check.rb sync_check.toml ./ +COPY ruby_common ruby_common + +RUN gem install bundler && bundle install diff --git a/tf-managed/modules/sync-check/service/Gemfile b/tf-managed/modules/sync-check/service/Gemfile new file mode 100644 index 000000000..e1a861576 --- /dev/null +++ b/tf-managed/modules/sync-check/service/Gemfile @@ -0,0 +1,7 @@ +# frozen_string_literal: true + +source 'https://rubygems.org' + +gem 'docker-api', '>= 2.2.0' +gem 'slack-ruby-client', '>= 2.1.0' +gem 'sys-filesystem', '>=1.4.3' diff --git a/tf-managed/modules/sync-check/service/Gemfile.lock b/tf-managed/modules/sync-check/service/Gemfile.lock new file mode 100644 index 000000000..9832df215 --- /dev/null +++ b/tf-managed/modules/sync-check/service/Gemfile.lock @@ -0,0 +1,41 @@ +GEM + remote: https://rubygems.org/ + specs: + docker-api (2.2.0) + excon (>= 0.47.0) + multi_json + excon (0.99.0) + faraday (2.7.4) + faraday-net_http (>= 2.0, < 3.1) + ruby2_keywords (>= 0.0.4) + faraday-mashify (0.1.1) + faraday (~> 2.0) + hashie + faraday-multipart (1.0.4) + multipart-post (~> 2) + faraday-net_http (3.0.2) + ffi (1.15.5) + gli (2.21.0) + hashie (5.0.0) + multi_json (1.15.0) + multipart-post (2.3.0) + ruby2_keywords (0.0.5) + slack-ruby-client (2.1.0) + faraday (>= 2.0) + faraday-mashify + faraday-multipart + gli + hashie + sys-filesystem (1.4.3) + ffi (~> 1.1) + +PLATFORMS + x86_64-linux + +DEPENDENCIES + docker-api (>= 2.2.0) + slack-ruby-client (>= 2.1.0) + sys-filesystem (>= 1.4.3) + +BUNDLED WITH + 2.3.4 diff --git a/tf-managed/modules/sync-check/service/docker-compose.yml b/tf-managed/modules/sync-check/service/docker-compose.yml new file mode 100644 index 000000000..51547105d --- /dev/null +++ b/tf-managed/modules/sync-check/service/docker-compose.yml @@ -0,0 +1,118 @@ +# Docker compose file to run continuous Forest sync on both mainnet and calibnet. 
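+#
+# Rough usage sketch (run_service.sh drives this; assumes the external volumes
+# below exist and the variables from forest-env.tpl are exported, e.g. via
+# `. ~/.forest_env`):
+#   for v in forest-data sync-check ruby-common; do docker volume create "$v"; done
+#   docker compose up --build --force-recreate --detach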
+ +version: "3.7" + +services: + forest_mainnet: + image: ghcr.io/chainsafe/forest:${FOREST_TAG} + hostname: forest-mainnet + container_name: forest-mainnet + networks: + - mainnet + volumes: + - type: volume + source: forest-data + target: ${FOREST_TARGET_DATA} + - type: volume + source: sync-check + target: ${FOREST_TARGET_SCRIPTS} + command: + - '--chain' + - 'mainnet' + - '--config' + - ${FOREST_TARGET_SCRIPTS}/sync_check.toml + - '--auto-download-snapshot' + - '--save-token' + - '/tmp/admin_token' + environment: + FOREST_GC_TRIGGER_FACTOR: "1.4" + restart: unless-stopped + labels: + com.centurylinklabs.watchtower.enable: true + forest_calibnet: + image: ghcr.io/chainsafe/forest:${FOREST_TAG} + hostname: forest-calibnet + container_name: forest-calibnet + networks: + - calibnet + volumes: + - type: volume + source: forest-data + target: ${FOREST_TARGET_DATA} + - type: volume + source: sync-check + target: ${FOREST_TARGET_SCRIPTS} + command: + - '--chain' + - 'calibnet' + - '--config' + - ${FOREST_TARGET_SCRIPTS}/sync_check.toml + - '--auto-download-snapshot' + - '--save-token' + - '/tmp/admin_token' + environment: + FOREST_GC_TRIGGER_FACTOR: "1.2" + restart: unless-stopped + labels: + com.centurylinklabs.watchtower.enable: true + # Probe container to validate Forest syncing. Needs to be on the same network. + forest_tester: + build: + context: . + dockerfile: Dockerfile-tester + container_name: forest-tester + privileged: true + networks: + - mainnet + - calibnet + volumes: + - type: volume + source: forest-data + target: ${FOREST_TARGET_DATA} + - type: volume + read_only: true + source: sync-check + target: ${FOREST_TARGET_SCRIPTS} + # Put common Ruby utils into a path that should be by default in Ruby PATH + - type: volume + read_only: true + source: ruby-common + target: /usr/local/share/ruby/site_ruby/cs_utils + - /var/run/docker.sock:/var/run/docker.sock + environment: + - LOG_DIR=${FOREST_TARGET_DATA} + - SCRIPTS_DIR=${FOREST_TARGET_SCRIPTS} + - FOREST_SLACK_API_TOKEN=${FOREST_SLACK_API_TOKEN} + - FOREST_SLACK_NOTIF_CHANNEL=${FOREST_SLACK_NOTIF_CHANNEL} + - FOREST_TARGET_DATA=${FOREST_TARGET_DATA} + - FOREST_TARGET_SCRIPTS=${FOREST_TARGET_SCRIPTS} + - FOREST_TAG=${FOREST_TAG} + entrypoint: ["/bin/bash", "-c"] + command: + - | + ruby ${FOREST_TARGET_SCRIPTS}/sync_check.rb forest-mainnet & + ruby ${FOREST_TARGET_SCRIPTS}/sync_check.rb forest-calibnet & + wait + sleep infinity + depends_on: + - forest_mainnet + - forest_calibnet + restart: unless-stopped + labels: + com.centurylinklabs.watchtower.enable: true + com.centurylinklabs.watchtower.depends-on: "forest-mainnet,forest-calibnet" + +volumes: + forest-data: + external: true + name: forest-data + sync-check: + external: true + name: sync-check + ruby-common: + external: true + name: ruby-common + +networks: + mainnet: + calibnet: diff --git a/tf-managed/modules/sync-check/service/forest-env.tpl b/tf-managed/modules/sync-check/service/forest-env.tpl new file mode 100644 index 000000000..4de4892be --- /dev/null +++ b/tf-managed/modules/sync-check/service/forest-env.tpl @@ -0,0 +1,9 @@ +export FOREST_TARGET_DATA="${FOREST_TARGET_DATA}" +export FOREST_TARGET_SCRIPTS="${FOREST_TARGET_SCRIPTS}" +export FOREST_TARGET_RUBY_COMMON="${FOREST_TARGET_RUBY_COMMON}" +export FOREST_SLACK_API_TOKEN="${slack_token}" +export FOREST_SLACK_NOTIF_CHANNEL="${slack_channel}" +export NEW_RELIC_API_KEY="${NEW_RELIC_API_KEY}" +export NEW_RELIC_ACCOUNT_ID="${NEW_RELIC_ACCOUNT_ID}" +export NEW_RELIC_REGION="${NEW_RELIC_REGION}" +export 
FOREST_TAG="${forest_tag}" diff --git a/tf-managed/modules/sync-check/service/health_check.sh b/tf-managed/modules/sync-check/service/health_check.sh new file mode 100755 index 000000000..1cfe8bd8c --- /dev/null +++ b/tf-managed/modules/sync-check/service/health_check.sh @@ -0,0 +1,112 @@ +#!/bin/bash + +# Script to check health status of a running node. +# The only prerequisite here is that the `forest` process is running. +# The script will wait till metrics endpoint becomes available. +# Input: Forest hostname + +# Exit codes +RET_OK=0 +RET_SYNC_TIPSET_STALE=1 +RET_SYNC_ERROR=2 +RET_SYNC_TIMEOUT=3 +RET_HOSTNAME_NOT_SET=4 + +if [ $# -eq 0 ]; then + echo "No arguments supplied. Need to provide Forest hostname, e.g. forest-mainnet." + exit "$RET_HOSTNAME_NOT_SET" +fi + +# Governs how long the health check will run to assert Forest condition +HEALTH_CHECK_DURATION_SECONDS=${HEALTH_CHECK_DURATION_SECONDS:-"360"} +# Forest metrics endpoint path +FOREST_METRICS_ENDPOINT=${FOREST_METRICS_ENDPOINT:-"http://$1:6116/metrics"} +# Initial sync timeout (in seconds) after which the health check will fail +HEALTH_CHECK_SYNC_TIMEOUT_SECONDS=${HEALTH_CHECK_SYNC_TIMEOUT_SECONDS:-"7200"} + +# Extracts metric value from the metric data +# Arg: name of the metric +function get_metric_value() { + grep -E "^$1" <<< "$metrics" | cut -d' ' -f2 +} + +# Updates metrics data with the latest metrics from Prometheus +# Arg: none +function update_metrics() { + metrics=$(curl --silent "$FOREST_METRICS_ENDPOINT") +} + +# Checks if an error occurred and is visible in the metrics. +# Arg 1: name of the error metric +# Arg 2: maximum number of occurrences for the assertion to pass (0 for strictly not pass) +function assert_error() { + errors="$(get_metric_value "$1")" + if [[ "$errors" -gt "$2" ]]; then + echo "❌ $1: $errors (max: $2)" + ret=$RET_SYNC_ERROR + fi +} + +##### Actual script + +# Wait for Forest to start syncing +# Excluding `tipset_start` from the unbound variable check +set +u +timeout="$HEALTH_CHECK_SYNC_TIMEOUT_SECONDS" +echo "⏳ Waiting for Forest to start syncing (up to $timeout seconds)..." +until [ -n "$tipset_start" ] || [ "$timeout" -le 0 ] +do + update_metrics + tipset_start="$(get_metric_value "last_validated_tipset_epoch")" + sleep 1 + timeout=$((timeout-1)) +done +# Re-enabling the unbound variable check +set -u + +if [ "$timeout" -le 0 ]; then + echo "❌ Timed out on sync wait" + exit "$RET_SYNC_TIMEOUT" +fi +echo "✅ Forest started syncing" + +# Let Forest run for the health check period +echo "⏳ Waiting for the health probe to finish..." +sleep "$HEALTH_CHECK_DURATION_SECONDS" + +# Grab last synced tipset epoch +update_metrics +tipset_end="$(get_metric_value "last_validated_tipset_epoch")" + +if [ -z "$tipset_end" ]; then + echo "❌ Did not manage to get sync status" + exit "$RET_SYNC_ERROR" +fi + +# Assert tipset epoch moved forward +echo "👉 Tipset start: $tipset_start, end: $tipset_end" +if [ "$tipset_end" -gt "$tipset_start" ]; then + echo "✅ Tipset epoch moving forward" + ret="$RET_OK" +else + echo "❌ Tipset epoch didn't move forward." 
+ ret="$RET_SYNC_TIPSET_STALE" +fi + +# Assert there are no sync errors +assert_error "network_head_evaluation_errors" 0 +assert_error "bootstrap_errors" 2 +assert_error "follow_network_interruptions" 0 +assert_error "follow_network_errors" 0 + +if [ "$ret" -ne "$RET_SYNC_ERROR" ]; then + echo "✅ No sync errors" +fi + +if [ "$ret" -eq "$RET_OK" ]; then + echo "✅ Health check passed" +else + echo "❌ Health check failed" +fi + +exit "$ret" diff --git a/tf-managed/modules/sync-check/service/init.sh b/tf-managed/modules/sync-check/service/init.sh new file mode 100755 index 000000000..d96561371 --- /dev/null +++ b/tf-managed/modules/sync-check/service/init.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +## Enable strict error handling, command tracing, and pipefail +set -eux + +# Wait for cloud-init to finish initializing the machine +cloud-init status --wait + +# Setting DEBIAN_FRONTEND to ensure non-interactive operations for APT +export DEBIAN_FRONTEND=noninteractive + +# Using timeout to ensure the script retries if the APT servers are temporarily unavailable. +timeout 10m bash -c 'until apt-get -qqq --yes update && \ + apt-get -qqq --yes install ruby ruby-dev gcc make; do sleep 10; \ +done' + +gem install slack-ruby-client sys-filesystem + +nohup /bin/bash ./run_service.sh > run_service_log.txt & + +if [ -n "$NEW_RELIC_API_KEY" ] ; then + curl -Ls https://download.newrelic.com/install/newrelic-cli/scripts/install.sh | bash && \ + sudo NEW_RELIC_API_KEY="$NEW_RELIC_API_KEY" \ + NEW_RELIC_ACCOUNT_ID="$NEW_RELIC_ACCOUNT_ID" \ + NEW_RELIC_REGION="$NEW_RELIC_REGION" \ + /usr/local/bin/newrelic install -y + +# The provided configurations are specific to New Relic. To gain a deeper understanding of these configuration details, you can visit: +# https://docs.newrelic.com/docs/infrastructure/install-infrastructure-agent/configuration/infrastructure-agent-configuration-settings/#offline-time-to-reset +cat >> /etc/newrelic-infra.yml < /etc/newrelic-infra/logging.d/logging.yml < /dev/null || true +docker container rm --force forest-calibnet 2> /dev/null || true +docker container rm --force forest-mainnet 2> /dev/null || true +docker container rm --force forest-tester 2> /dev/null || true + +## Ensure watchtower is running +docker stop watchtower 2> /dev/null || true +docker wait watchtower 2> /dev/null || true +docker run \ + --detach \ + --restart unless-stopped \ + --privileged \ + -v /var/run/docker.sock:/var/run/docker.sock \ + --name watchtower \ + containrrr/watchtower \ + --label-enable --include-stopped --revive-stopped --stop-timeout 120s --interval 600 + +## We need it to access the DATA_DIR regardless of the user. 
+chmod 0777 /var/lib/docker/volumes/forest-data/_data
+
+## Ensure volumes are clean
+rm -rf /var/lib/docker/volumes/forest-data/_data/*
+rm -rf /var/lib/docker/volumes/sync-check/_data/*
+rm -rf /var/lib/docker/volumes/ruby-common/_data/*
+
+## Copy all relevant scripts
+cp --recursive /root/* /var/lib/docker/volumes/sync-check/_data/
+cp --recursive /root/ruby_common/* /var/lib/docker/volumes/ruby-common/_data/
+
+## Run the sync-check supervisor, which manages the services and health checks
+ruby sync_check_process.rb
diff --git a/tf-managed/modules/sync-check/service/sync_check.rb b/tf-managed/modules/sync-check/service/sync_check.rb
new file mode 100755
index 000000000..eb24ab777
--- /dev/null
+++ b/tf-managed/modules/sync-check/service/sync_check.rb
@@ -0,0 +1,59 @@
+# frozen_string_literal: true
+
+require_relative 'ruby_common/slack_client'
+require_relative 'ruby_common/docker_utils'
+require_relative 'ruby_common/utils'
+require_relative 'sync_check_process'
+require 'logger'
+require 'fileutils'
+
+# Retrieves an environment variable, failing if it's not set or empty.
+def get_and_assert_env_variable(name)
+  var = ENV[name]
+  raise "Please set the #{name} environment variable" if var.nil? || var.empty?
+
+  var
+end
+
+SLACK_TOKEN = get_and_assert_env_variable 'FOREST_SLACK_API_TOKEN'
+CHANNEL = get_and_assert_env_variable 'FOREST_SLACK_NOTIF_CHANNEL'
+SCRIPTS_DIR = get_and_assert_env_variable 'SCRIPTS_DIR'
+LOG_DIR = get_and_assert_env_variable 'LOG_DIR'
+
+raise 'No arguments supplied. Please provide Forest hostname, e.g. forest-mainnet' if ARGV.empty?
+
+hostname = ARGV[0]
+network = hostname.match(/-(\w+)$/)[1]
+
+# Current datetime, to append to the log files
+DATE = Time.new.strftime '%FT%H:%M:%S'
+LOG_HEALTH = "#{LOG_DIR}/#{hostname}_#{DATE}_health"
+LOG_FOREST = "#{LOG_DIR}/#{hostname}_#{DATE}_forest"
+LOG_SYNC = "#{LOG_DIR}/#{hostname}_#{DATE}_sync"
+
+# Create log directory
+FileUtils.mkdir_p LOG_DIR
+
+logger = Logger.new(LOG_SYNC)
+
+# Run the actual health check
+logger.info 'Running the health check...'
+health_check_passed = system("bash #{SCRIPTS_DIR}/health_check.sh #{hostname} > #{LOG_HEALTH} 2>&1")
+logger.info 'Health check finished'
+
+# Save the log capture from the Forest container
+container_logs = DockerUtils.get_container_logs hostname
+File.write(LOG_FOREST, container_logs)
+
+client = SlackClient.new CHANNEL, SLACK_TOKEN
+
+if health_check_passed
+  client.post_message "✅ Sync check for #{hostname} passed. 🌲🌳🌲🌳🌲"
+else
+  client.post_message "⛔ Sync check for #{hostname} fiascoed. 
🔥🌲🔥" + SyncCheck.new.run_forest_tool("db destroy --chain #{network} --force") + logger.info 'DB Destroyed' +end +client.attach_files(LOG_HEALTH, LOG_SYNC, LOG_FOREST) + +logger.info 'Sync check finished' diff --git a/tf-managed/modules/sync-check/service/sync_check.toml b/tf-managed/modules/sync-check/service/sync_check.toml new file mode 100644 index 000000000..fe5e5b082 --- /dev/null +++ b/tf-managed/modules/sync-check/service/sync_check.toml @@ -0,0 +1,4 @@ +[client] +data_dir = "/volumes/forest_data" +encrypt_keystore = false +metrics_address = "0.0.0.0:6116" diff --git a/tf-managed/modules/sync-check/service/sync_check_process.rb b/tf-managed/modules/sync-check/service/sync_check_process.rb new file mode 100755 index 000000000..48cfc25ac --- /dev/null +++ b/tf-managed/modules/sync-check/service/sync_check_process.rb @@ -0,0 +1,123 @@ +# frozen_string_literal: true + +require_relative 'ruby_common/slack_client' +require_relative 'ruby_common/utils' + +require 'English' +require 'fileutils' +require 'sys/filesystem' +require 'logger' +require 'open3' + +SLACK_TOKEN = get_and_assert_env_variable 'FOREST_SLACK_API_TOKEN' +CHANNEL = get_and_assert_env_variable 'FOREST_SLACK_NOTIF_CHANNEL' +FOREST_DATA = get_and_assert_env_variable 'FOREST_TARGET_DATA' +FOREST_SCRIPTS = get_and_assert_env_variable 'FOREST_TARGET_SCRIPTS' +FOREST_TAG = get_and_assert_env_variable 'FOREST_TAG' + +# Sync check class encompassing all required methods and fields +class SyncCheck + def initialize + @logger = Logger.new($stdout) + @client = SlackClient.new CHANNEL, SLACK_TOKEN + end + + # Runs a command with an arbitrary binary available in the chainsafe/forest image + def run_forest_container(binary, command) + @logger.debug "Running `#{binary}` command with #{command}" + stdout, stderr, status = Open3.capture3("docker run --entrypoint #{binary} \ + --init \ + --volume forest-data:#{FOREST_DATA} \ + --volume sync-check:#{FOREST_SCRIPTS} \ + --rm \ + ghcr.io/chainsafe/forest:#{FOREST_TAG} \ + --config #{FOREST_SCRIPTS}/sync_check.toml \ + #{command}") + raise "Failed `#{binary} #{command}`.\n```\nSTDOUT:\n#{stdout}\nSTDERR:\n#{stderr}```" unless status.success? + end + + # Runs a command for forest-tool. The configuration is pre-defined. + def run_forest_tool(command) + run_forest_container('forest-tool', command) + end + + # Runs a command for forest node. The configuration is pre-defined. + def run_forest(command) + run_forest_container('forest', command) + end + + # Gets current disk usage. + def disk_usage + stat = Sys::Filesystem.stat('/') + 1 - stat.blocks_available.fdiv(stat.blocks) + end + + # Starts docker compose services. + def start_services + @logger.info 'Starting services' + `docker compose up --build --force-recreate --detach` + raise 'Failed to start services' unless $CHILD_STATUS.success? + end + + # Stops docker compose services + def stop_services + @logger.info 'Stopping services' + `docker compose down` + raise 'Failed to stop services' unless $CHILD_STATUS.success? + end + + # Checks if the docker compose services are up + def services_up? + output = `docker compose ps --services --filter "status=running"` + $CHILD_STATUS.success? && !output.strip.empty? 
+ end + + # logs and sends a slack message containing the error description + def report_error(error) + @logger.error error.message + @client.post_message '💀 Sync check fiasco ❌' + @client.attach_comment error.message + end + + # Cleans up the sync check + def cleanup + @logger.info 'Cleaning up sync check' + @client.post_message '🧹 Cleaning up sync check' + + stop_services + cleanup_command = "docker run --rm --volume forest-data:#{FOREST_DATA} busybox sh -c 'rm -rf #{FOREST_DATA}/**'" + + stdout, stderr, status = Open3.capture3(cleanup_command) + unless status.success? + error_message = "Cleanup failed with status: #{status.exitstatus}. STDOUT: #{stdout}, STDERR: #{stderr}" + @logger.error error_message + @client.attach_comment "Cleanup error: #{error_message}" + raise 'Failed to clean up Docker volume' + else + @logger.info 'Cleanup successful' + @client.attach_comment '🧹 Docker volume cleanup completed successfully ✅' + end + + @client.attach_comment '🧹 Cleanup finished ✅' + end + + # start the sync check loop + def run + loop do + begin + `docker image prune -f` + cleanup unless disk_usage < 0.85 + start_services unless services_up? + rescue StandardError => e + report_error e + end + + # sleep 1 hour before checking again + sleep 60 * 60 + end + end +end + +##### +# Runs only when executed directly +SyncCheck.new.run if __FILE__ == $PROGRAM_NAME diff --git a/tf-managed/modules/sync-check/variables.tf b/tf-managed/modules/sync-check/variables.tf new file mode 100644 index 000000000..f953b60c0 --- /dev/null +++ b/tf-managed/modules/sync-check/variables.tf @@ -0,0 +1,68 @@ +variable "digitalocean_token" { + description = "Token for authentication." + type = string + sensitive = true +} + +variable "name" { + description = "The name of Forest Droplet" + type = string +} + +variable "size" { + description = "The size of the droplet instance to launch" + type = string +} + +variable "slack_channel" { + description = "slack channel name for notifications" + type = string +} + +variable "slack_token" { + description = "slack access token" + type = string + sensitive = true +} + +variable "image" { + description = "The ID of the AMI to use for the Droplet" + type = string + default = "docker-20-04" +} + +variable "region" { + description = "The region where resources will be created" + type = string + default = "fra1" +} + +variable "project" { + description = "DigitalOcean project used as parent for the created droplet" + type = string + default = "Forest-DEV" # Alternative: "Default" +} + +variable "NEW_RELIC_REGION" { + description = "The New Relic Platform Region" + type = string + default = "EU" +} + +variable "NEW_RELIC_API_KEY" { + description = "New Relic API KEY" + default = "" + type = string + sensitive = true +} + +variable "NEW_RELIC_ACCOUNT_ID" { + description = "The New Relic Account ID" + default = "" + type = string + sensitive = true +} + +variable "common_resources_dir" { + type = string +} From 5ed4d32f7e3511518c20c30a1aa22f5ea5bbceec Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 7 Dec 2023 16:50:39 +0000 Subject: [PATCH 02/46] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tf-managed/.gitignore | 4 ++-- tf-managed/live/terragrunt.hcl | 6 +++--- tf-managed/modules/sync-check/main.tf | 1 - 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/tf-managed/.gitignore b/tf-managed/.gitignore index 9b8a46e69..6304eb3c1 100644 --- 
a/tf-managed/.gitignore
+++ b/tf-managed/.gitignore
@@ -10,8 +10,8 @@ crash.log
 crash.*.log
 
 # Exclude all .tfvars files, which are likely to contain sensitive data, such as
-# password, private keys, and other secrets. These should not be part of version 
-# control as they are data points which are potentially sensitive and subject 
+# password, private keys, and other secrets. These should not be part of version
+# control as they are data points which are potentially sensitive and subject
 # to change depending on the environment.
 *.tfvars
 *.tfvars.json
diff --git a/tf-managed/live/terragrunt.hcl b/tf-managed/live/terragrunt.hcl
index b0f6523aa..7e73413bc 100644
--- a/tf-managed/live/terragrunt.hcl
+++ b/tf-managed/live/terragrunt.hcl
@@ -1,6 +1,6 @@
 locals {
-  # Parse the file path we're in to read the env name: e.g., env 
-  # will be "dev" in the dev folder, "stage" in the stage folder, 
+  # Parse the file path we're in to read the env name: e.g., env
+  # will be "dev" in the dev folder, "stage" in the stage folder,
   # etc.
   parsed = regex(".*/environments/(?P<env>.*?)/.*", get_terragrunt_dir())
   env    = local.parsed.env
@@ -15,7 +15,7 @@ remote_state {
   }
   config = {
     // if the environment is dev, use the dev bucket, otherwise use the prod bucket
-    bucket = (local.env == "prod" 
+    bucket = (local.env == "prod"
       ? "hubert-bucket-prod"
       : "hubert-bucket-dev"
     )
diff --git a/tf-managed/modules/sync-check/main.tf b/tf-managed/modules/sync-check/main.tf
index a323f3ea3..0b8a3a661 100644
--- a/tf-managed/modules/sync-check/main.tf
+++ b/tf-managed/modules/sync-check/main.tf
@@ -124,4 +124,3 @@ resource "digitalocean_project_resources" "connect_forest_project" {
   project   = data.digitalocean_project.forest_project.id
   resources = [digitalocean_droplet.forest.urn]
 }
-
From 95d3fb4a5acf922a45295b4112a04196ecbcc2bb Mon Sep 17 00:00:00 2001
From: Hubert Bugaj
Date: Fri, 8 Dec 2023 12:05:00 +0100
Subject: [PATCH 03/46] dry env

---
 tf-managed/live/environments/dev/sync-check/terragrunt.hcl  | 2 +-
 tf-managed/live/environments/prod/sync-check/terragrunt.hcl | 5 ++---
 tf-managed/live/terragrunt.hcl                              | 1 +
 tf-managed/modules/sync-check/main.tf                       | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tf-managed/live/environments/dev/sync-check/terragrunt.hcl b/tf-managed/live/environments/dev/sync-check/terragrunt.hcl
index e4ea12461..acee94459 100644
--- a/tf-managed/live/environments/dev/sync-check/terragrunt.hcl
+++ b/tf-managed/live/environments/dev/sync-check/terragrunt.hcl
@@ -10,6 +10,6 @@ terraform {
 }
 
 inputs = {
-  name = "hubert-sync-check-dev" # TODO get environment from terragrunt
+  name = "hubert-sync-check"
   size = "s-4vcpu-16gb-amd"
 }
diff --git a/tf-managed/live/environments/prod/sync-check/terragrunt.hcl b/tf-managed/live/environments/prod/sync-check/terragrunt.hcl
index 0e1a2da61..acee94459 100644
--- a/tf-managed/live/environments/prod/sync-check/terragrunt.hcl
+++ b/tf-managed/live/environments/prod/sync-check/terragrunt.hcl
@@ -10,7 +10,6 @@ terraform {
 }
 
 inputs = {
-  # Configure service:
-  name = "hubert-sync-check-prod" # TODO get environment from terragrunt
-  size = "s-4vcpu-16gb-amd" # droplet size
+  name = "hubert-sync-check"
+  size = "s-4vcpu-16gb-amd"
 }
diff --git a/tf-managed/live/terragrunt.hcl b/tf-managed/live/terragrunt.hcl
index 7e73413bc..058610e7c 100644
--- a/tf-managed/live/terragrunt.hcl
+++ b/tf-managed/live/terragrunt.hcl
@@ -42,4 +42,5 @@ remote_state {
 inputs = {
   common_resources_dir = format("%s/../common", get_parent_terragrunt_dir())
   slack_channel = (local.env == "prod" ? 
"#forest-notifications" : "#forest-dump") + env = local.env } diff --git a/tf-managed/modules/sync-check/main.tf b/tf-managed/modules/sync-check/main.tf index 0b8a3a661..26965bc60 100644 --- a/tf-managed/modules/sync-check/main.tf +++ b/tf-managed/modules/sync-check/main.tf @@ -86,7 +86,7 @@ locals { resource "digitalocean_droplet" "forest" { image = var.image - name = var.name + name = format("%s-%s", var.env, var.name) region = var.region size = var.size # Re-initialize resource if this hash changes: From c96a7166029eb6ee6b0abcbdd5d25a62354d7d36 Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Fri, 8 Dec 2023 12:11:52 +0100 Subject: [PATCH 04/46] hclfmt --- tf-managed/live/environments/dev/sync-check/terragrunt.hcl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tf-managed/live/environments/dev/sync-check/terragrunt.hcl b/tf-managed/live/environments/dev/sync-check/terragrunt.hcl index acee94459..488ea1f54 100644 --- a/tf-managed/live/environments/dev/sync-check/terragrunt.hcl +++ b/tf-managed/live/environments/dev/sync-check/terragrunt.hcl @@ -10,6 +10,6 @@ terraform { } inputs = { - name = "hubert-sync-check" - size = "s-4vcpu-16gb-amd" + name = "hubert-sync-check" + size = "s-4vcpu-16gb-amd" } From 18e464061b88308bc65ffa18d3f0ed725acd0789 Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Fri, 8 Dec 2023 13:02:58 +0100 Subject: [PATCH 05/46] prod fix --- tf-managed/live/terragrunt.hcl | 3 --- tf-managed/modules/sync-check/variables.tf | 5 +++++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tf-managed/live/terragrunt.hcl b/tf-managed/live/terragrunt.hcl index 058610e7c..6ee0099d9 100644 --- a/tf-managed/live/terragrunt.hcl +++ b/tf-managed/live/terragrunt.hcl @@ -22,9 +22,6 @@ remote_state { key = "${local.env}-terraform.tfstate" region = "eu-west-1" endpoint = "https://fra1.digitaloceanspaces.com" - //endpoints = { - // s3 = "https://fra1.digitaloceanspaces.com" - //} skip_bucket_versioning = true skip_bucket_ssencryption = true skip_bucket_root_access = true diff --git a/tf-managed/modules/sync-check/variables.tf b/tf-managed/modules/sync-check/variables.tf index f953b60c0..4b3cc1db6 100644 --- a/tf-managed/modules/sync-check/variables.tf +++ b/tf-managed/modules/sync-check/variables.tf @@ -66,3 +66,8 @@ variable "NEW_RELIC_ACCOUNT_ID" { variable "common_resources_dir" { type = string } + +variable "env" { + description = "The environment name" + type = string +} From a5b321f6e51e6f893b2861b7b8bbefc086163314 Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Tue, 12 Dec 2023 14:04:04 +0100 Subject: [PATCH 06/46] get env from root --- .../snapshot-service/terragrunt.hcl | 15 ++ .../sync-check/terragrunt.hcl | 4 +- .../sync-check/terragrunt.hcl | 6 +- tf-managed/modules/daily-snapshot/main.tf | 176 ++++++++++++++++++ .../modules/daily-snapshot/prep_sources.sh | 14 ++ .../daily-snapshot/service/calibnet_cron_job | 7 + .../daily-snapshot/service/daily_snapshot.rb | 68 +++++++ .../daily-snapshot/service/forest-env.tpl | 11 ++ .../modules/daily-snapshot/service/init.sh | 41 ++++ .../daily-snapshot/service/mainnet_cron_job | 7 + .../service/newrelic_fail2ban.sh | 55 ++++++ .../service/upload_filops_snapshot.sh | 65 +++++++ .../daily-snapshot/service/upload_snapshot.sh | 114 ++++++++++++ tf-managed/modules/daily-snapshot/variable.tf | 116 ++++++++++++ tf-managed/modules/sync-check/main.tf | 4 +- tf-managed/modules/sync-check/variables.tf | 2 +- 16 files changed, 697 insertions(+), 8 deletions(-) create mode 100644 
tf-managed/live/environments/dev/applications/snapshot-service/terragrunt.hcl rename tf-managed/live/environments/dev/{ => applications}/sync-check/terragrunt.hcl (87%) rename tf-managed/live/environments/prod/{ => applications}/sync-check/terragrunt.hcl (74%) create mode 100644 tf-managed/modules/daily-snapshot/main.tf create mode 100755 tf-managed/modules/daily-snapshot/prep_sources.sh create mode 100755 tf-managed/modules/daily-snapshot/service/calibnet_cron_job create mode 100644 tf-managed/modules/daily-snapshot/service/daily_snapshot.rb create mode 100644 tf-managed/modules/daily-snapshot/service/forest-env.tpl create mode 100755 tf-managed/modules/daily-snapshot/service/init.sh create mode 100755 tf-managed/modules/daily-snapshot/service/mainnet_cron_job create mode 100644 tf-managed/modules/daily-snapshot/service/newrelic_fail2ban.sh create mode 100644 tf-managed/modules/daily-snapshot/service/upload_filops_snapshot.sh create mode 100755 tf-managed/modules/daily-snapshot/service/upload_snapshot.sh create mode 100644 tf-managed/modules/daily-snapshot/variable.tf diff --git a/tf-managed/live/environments/dev/applications/snapshot-service/terragrunt.hcl b/tf-managed/live/environments/dev/applications/snapshot-service/terragrunt.hcl new file mode 100644 index 000000000..6090cfe9c --- /dev/null +++ b/tf-managed/live/environments/dev/applications/snapshot-service/terragrunt.hcl @@ -0,0 +1,15 @@ +# Automatically find the root terragrunt.hcl and inherit its +# configuration +include "root" { + path = find_in_parent_folders() +} + +# Load the actual Terraform module +terraform { + source = format("%s/../modules/daily-snapshot", get_parent_terragrunt_dir()) +} + +inputs = { + name = "forest-snapshot" + size = "s-4vcpu-16gb-amd" +} diff --git a/tf-managed/live/environments/dev/sync-check/terragrunt.hcl b/tf-managed/live/environments/dev/applications/sync-check/terragrunt.hcl similarity index 87% rename from tf-managed/live/environments/dev/sync-check/terragrunt.hcl rename to tf-managed/live/environments/dev/applications/sync-check/terragrunt.hcl index 488ea1f54..d8961db58 100644 --- a/tf-managed/live/environments/dev/sync-check/terragrunt.hcl +++ b/tf-managed/live/environments/dev/applications/sync-check/terragrunt.hcl @@ -1,6 +1,6 @@ # Automatically find the root terragrunt.hcl and inherit its # configuration -include { +include "root" { path = find_in_parent_folders() } @@ -10,6 +10,6 @@ terraform { } inputs = { - name = "hubert-sync-check" + name = "sync-check" size = "s-4vcpu-16gb-amd" } diff --git a/tf-managed/live/environments/prod/sync-check/terragrunt.hcl b/tf-managed/live/environments/prod/applications/sync-check/terragrunt.hcl similarity index 74% rename from tf-managed/live/environments/prod/sync-check/terragrunt.hcl rename to tf-managed/live/environments/prod/applications/sync-check/terragrunt.hcl index acee94459..d8961db58 100644 --- a/tf-managed/live/environments/prod/sync-check/terragrunt.hcl +++ b/tf-managed/live/environments/prod/applications/sync-check/terragrunt.hcl @@ -1,6 +1,6 @@ # Automatically find the root terragrunt.hcl and inherit its # configuration -include { +include "root" { path = find_in_parent_folders() } @@ -10,6 +10,6 @@ terraform { } inputs = { - name = "hubert-sync-check" - size = "s-4vcpu-16gb-amd" + name = "sync-check" + size = "s-4vcpu-16gb-amd" } diff --git a/tf-managed/modules/daily-snapshot/main.tf b/tf-managed/modules/daily-snapshot/main.tf new file mode 100644 index 000000000..f83798ead --- /dev/null +++ b/tf-managed/modules/daily-snapshot/main.tf 
@@ -0,0 +1,176 @@
+# This terraform script executes the following steps:
+# - Tar the ruby and shell script files (the hash of this tar file is used to
+#   determine when to re-deploy the service)
+# - Boot a new droplet
+# - Copy over the tar file
+# - Run the init.sh script in the background
+
+terraform {
+  required_version = "~> 1.3"
+
+  required_providers {
+    digitalocean = {
+      source  = "digitalocean/digitalocean"
+      version = "~> 2.0"
+    }
+    external = {
+      source  = "hashicorp/external"
+      version = "~> 2.1"
+    }
+    local = {
+      source  = "hashicorp/local"
+      version = "~> 2.1"
+    }
+
+  }
+}
+
+provider "digitalocean" {
+  token = var.digitalocean_token
+}
+
+// Ugly hack because 'archive_file' cannot mix files and folders.
+data "external" "sources_tar" {
+  program = ["sh", "${path.module}/prep_sources.sh", path.module, var.common_resources_dir]
+}
+
+
+data "local_file" "sources" {
+  filename = data.external.sources_tar.result.path
+}
+
+// Note: The init.sh file is also included in the sources.tar such that the hash
+// of the archive captures the entire state of the machine.
+// This is a workaround, and because of this, we need to suppress the tflint warning here
+// for unused declarations related to the 'init.sh' file.
+// tflint-ignore: terraform_unused_declarations
+data "local_file" "init" {
+  filename = "${path.module}/service/init.sh"
+}
+
+data "digitalocean_ssh_keys" "keys" {
+  sort {
+    key       = "name"
+    direction = "asc"
+  }
+}
+
+# Set required environment variables
locals {
+  env_content = templatefile("${path.module}/service/forest-env.tpl", {
+    R2_ACCESS_KEY        = var.R2_ACCESS_KEY,
+    R2_SECRET_KEY        = var.R2_SECRET_KEY,
+    r2_endpoint          = var.r2_endpoint,
+    slack_token          = var.slack_token,
+    slack_channel        = var.slack_channel,
+    snapshot_bucket      = var.snapshot_bucket,
+    snapshot_endpoint    = var.snapshot_endpoint,
+    NEW_RELIC_API_KEY    = var.NEW_RELIC_API_KEY,
+    NEW_RELIC_ACCOUNT_ID = var.NEW_RELIC_ACCOUNT_ID,
+    NEW_RELIC_REGION     = var.NEW_RELIC_REGION,
+    BASE_FOLDER          = "/root",
+    forest_tag           = var.forest_tag
+  })
+}
+
+locals {
+  init_commands = ["cd /root/",
+    "tar xf sources.tar",
+    # Set required environment variables
+    "echo '${local.env_content}' >> /root/.forest_env",
+    "echo '. ~/.forest_env' >> .bashrc",
+    ". ~/.forest_env",
~/.forest_env", + "nohup sh ./init.sh > init_log.txt &", + # Exiting without a sleep sometimes kills the script :-/ + "sleep 60s" + ] +} + +resource "digitalocean_droplet" "forest" { + image = var.image + name = format("%s-%s", var.environment, var.name) + region = var.region + size = var.size + # Re-initialize resource if this hash changes: + user_data = join("-", [data.local_file.sources.content_sha256, sha256(join("", local.init_commands))]) + tags = ["iac", var.environment] + ssh_keys = data.digitalocean_ssh_keys.keys.ssh_keys[*].fingerprint + monitoring = true + + graceful_shutdown = false + + connection { + host = self.ipv4_address + user = "root" + type = "ssh" + } + + # Push the sources.tar file to the newly booted droplet + provisioner "file" { + source = data.local_file.sources.filename + destination = "/root/sources.tar" + } + + provisioner "remote-exec" { + inline = local.init_commands + } +} + + +data "digitalocean_project" "forest_project" { + name = var.project +} + +# Connect the droplet to the forest project (otherwise it ends up in +# "ChainBridge" which is the default project) +resource "digitalocean_project_resources" "connect_forest_project" { + project = data.digitalocean_project.forest_project.id + resources = [digitalocean_droplet.forest.urn] +} + +resource "digitalocean_firewall" "forest-firewall" { + name = var.name + + inbound_rule { + protocol = "tcp" + port_range = "22" + source_addresses = var.source_addresses + } + + inbound_rule { + protocol = "tcp" + port_range = "2345" + source_addresses = var.source_addresses + } + + inbound_rule { + protocol = "tcp" + port_range = "80" + source_addresses = var.source_addresses + } + + inbound_rule { + protocol = "udp" + port_range = "53" + source_addresses = var.source_addresses + } + + outbound_rule { + protocol = "tcp" + port_range = "all" + destination_addresses = var.destination_addresses + } + + outbound_rule { + protocol = "udp" + port_range = "53" + destination_addresses = var.destination_addresses + } + + droplet_ids = [digitalocean_droplet.forest.id] +} + +# This ip address may be used in the future by monitoring software +output "ip" { + value = [digitalocean_droplet.forest.ipv4_address] +} diff --git a/tf-managed/modules/daily-snapshot/prep_sources.sh b/tf-managed/modules/daily-snapshot/prep_sources.sh new file mode 100755 index 000000000..6324e5da4 --- /dev/null +++ b/tf-managed/modules/daily-snapshot/prep_sources.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# Enable strict error handling and command tracing +set -euxo pipefail + +# Copy local source files in a folder together with ruby_common and create a zip archive. 
+
+cd "$1"
+cp --archive $2/ruby_common service/
+
+rm -f sources.tar
+(cd service && tar cf ../sources.tar --sort=name --mtime='UTC 2019-01-01' ./* > /dev/null 2>&1)
+rm -fr service/ruby_common
+echo "{ \"path\": \"$1/sources.tar\" }"
diff --git a/tf-managed/modules/daily-snapshot/service/calibnet_cron_job b/tf-managed/modules/daily-snapshot/service/calibnet_cron_job
new file mode 100755
index 000000000..a492ad45d
--- /dev/null
+++ b/tf-managed/modules/daily-snapshot/service/calibnet_cron_job
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+# shellcheck source=/dev/null
+source ~/.forest_env
+cd "$BASE_FOLDER" || exit
+flock -n /tmp/calibnet.lock -c "ruby daily_snapshot.rb calibnet > logs/calibnet_log.txt 2>&1"
+flock -n /tmp/calibnet_filops.lock -c "./upload_filops_snapshot.sh calibnet > logs/filops_calibnet_log.txt 2>&1"
diff --git a/tf-managed/modules/daily-snapshot/service/daily_snapshot.rb b/tf-managed/modules/daily-snapshot/service/daily_snapshot.rb
new file mode 100644
index 000000000..fa8e3ff29
--- /dev/null
+++ b/tf-managed/modules/daily-snapshot/service/daily_snapshot.rb
@@ -0,0 +1,68 @@
+# frozen_string_literal: true
+
+require_relative 'ruby_common/slack_client'
+require_relative 'ruby_common/docker_utils'
+require_relative 'ruby_common/utils'
+
+require 'date'
+require 'logger'
+require 'fileutils'
+require 'active_support/time'
+
+BASE_FOLDER = get_and_assert_env_variable 'BASE_FOLDER'
+SLACK_TOKEN = get_and_assert_env_variable 'SLACK_API_TOKEN'
+CHANNEL = get_and_assert_env_variable 'SLACK_NOTIF_CHANNEL'
+
+# Query the date of the most recent snapshot.
+def latest_snapshot_date(chain_name = 'calibnet')
+  # We do not support HEAD requests but we _do_ support empty ranges.
+  filename = `curl --remote-name --remote-header-name --write-out "%{filename_effective}" --silent https://forest-archive.chainsafe.dev/latest/#{chain_name}/ -H "Range: bytes=0-0"`
+  # Curl will create a file with a single byte in it. Let's clean it up.
+  File.delete(filename)
+  snapshot_format = /^([^_]+?)_snapshot_(?<network>[^_]+?)_(?<date>\d{4}-\d{2}-\d{2})_height_(?<height>\d+)(\.forest)?\.car.zst$/
+  filename.match(snapshot_format) do |m|
+    m[:date].to_date
+  end
+end
+
+raise 'No chain name supplied. Please provide chain identifier, e.g. calibnet or mainnet' if ARGV.empty?
+
+CHAIN_NAME = ARGV[0]
+
+# Current datetime, to append to the log files
+DATE = Time.new.strftime '%FT%H:%M:%S'
+LOG_EXPORT_SCRIPT_RUN = "logs/#{CHAIN_NAME}_#{DATE}_script_run.txt"
+LOG_EXPORT_DAEMON = "logs/#{CHAIN_NAME}_#{DATE}_daemon.txt"
+LOG_EXPORT_METRICS = "logs/#{CHAIN_NAME}_#{DATE}_metrics.txt"
+
+client = SlackClient.new CHANNEL, SLACK_TOKEN
+
+# Query the date of the most recent snapshot. This is used to limit the number
+# of victory messages to 1/day even if we upload multiple snapshots per day.
+date_before_export = latest_snapshot_date(CHAIN_NAME)
+
+# conditionally add timestamps to logs without timestamps
+add_timestamps_cmd = %q[awk '{ if ($0 !~ /^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{6}Z/) print strftime("[%Y-%m-%d %H:%M:%S]"), $0; else print $0; fflush(); }']
+upload_cmd = "set -o pipefail && \
+timeout --signal=KILL 8h ./upload_snapshot.sh #{CHAIN_NAME} #{LOG_EXPORT_DAEMON} #{LOG_EXPORT_METRICS} | #{add_timestamps_cmd}"
+
+# The command needs to be run indirectly to avoid syntax errors in the shell.
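+# (Sketch of the reasoning: `upload_cmd` relies on `set -o pipefail` and a
+# pipe; Ruby's `system` with a plain string would use /bin/sh, which may not
+# support pipefail, so the command is handed to `bash -c` as a single string.)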
+snapshot_uploaded = system('bash', '-c', upload_cmd, %i[out err] => LOG_EXPORT_SCRIPT_RUN) + +if snapshot_uploaded + date_after_export = latest_snapshot_date(CHAIN_NAME) + + # If this is the first new snapshot of the day, send a victory message to slack + unless date_before_export == date_after_export + client.post_message "✅ Snapshot uploaded for #{CHAIN_NAME}. 🌲🌳🌲🌳🌲" + end +else + client.post_message "⛔ Snapshot failed for #{CHAIN_NAME}. 🔥🌲🔥 " + # attach the log file and print the contents to STDOUT + [LOG_EXPORT_SCRIPT_RUN, LOG_EXPORT_DAEMON, LOG_EXPORT_METRICS].each do |log_file| + client.attach_files(log_file) if File.exist?(log_file) + end +end + +[LOG_EXPORT_SCRIPT_RUN, LOG_EXPORT_DAEMON, LOG_EXPORT_METRICS].each do |log_file| + puts "Snapshot export log:\n#{File.read(log_file)}\n\n" if File.exist?(log_file) +end diff --git a/tf-managed/modules/daily-snapshot/service/forest-env.tpl b/tf-managed/modules/daily-snapshot/service/forest-env.tpl new file mode 100644 index 000000000..14f644c7a --- /dev/null +++ b/tf-managed/modules/daily-snapshot/service/forest-env.tpl @@ -0,0 +1,11 @@ +export R2_ACCESS_KEY="${R2_ACCESS_KEY}" +export R2_SECRET_KEY="${R2_SECRET_KEY}" +export R2_ENDPOINT="${r2_endpoint}" +export SLACK_API_TOKEN="${slack_token}" +export SLACK_NOTIF_CHANNEL="${slack_channel}" +export SNAPSHOT_BUCKET="${snapshot_bucket}" +export NEW_RELIC_API_KEY="${NEW_RELIC_API_KEY}" +export NEW_RELIC_ACCOUNT_ID="${NEW_RELIC_ACCOUNT_ID}" +export NEW_RELIC_REGION="${NEW_RELIC_REGION}" +export BASE_FOLDER="${BASE_FOLDER}" +export FOREST_TAG="${forest_tag}" diff --git a/tf-managed/modules/daily-snapshot/service/init.sh b/tf-managed/modules/daily-snapshot/service/init.sh new file mode 100755 index 000000000..2ad8d923a --- /dev/null +++ b/tf-managed/modules/daily-snapshot/service/init.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +set -eux + +# Wait for cloud-init to finish initializing the machine +cloud-init status --wait + +# Setting DEBIAN_FRONTEND to ensure non-interactive operations for APT +export DEBIAN_FRONTEND=noninteractive + +# Using timeout to ensure the script retries if the APT servers are temporarily unavailable. +timeout 10m bash -c 'until apt-get -qqq --yes update && \ + apt-get -qqq --yes install ruby ruby-dev anacron awscli; do sleep 10; \ +done' + +# Install the gems +gem install docker-api slack-ruby-client +gem install activesupport -v 7.0.8 + +# 1. Configure aws +# 2. Create forest_db directory +# 3. 
Copy scripts to /etc/cron.hourly
+
+## Configure aws
+aws configure set default.s3.multipart_chunksize 4GB
+aws configure set aws_access_key_id "$R2_ACCESS_KEY"
+aws configure set aws_secret_access_key "$R2_SECRET_KEY"
+
+## Create forest data directory
+mkdir forest_db logs
+chmod 777 forest_db logs
+mkdir --parents -- "$BASE_FOLDER/forest_db/filops"
+
+# Make the scripts executable
+chmod +x ./upload_filops_snapshot.sh
+
+# Run new_relic and fail2ban scripts
+bash newrelic_fail2ban.sh &
+
+# Setup cron jobs
+cp calibnet_cron_job mainnet_cron_job /etc/cron.hourly/
diff --git a/tf-managed/modules/daily-snapshot/service/mainnet_cron_job b/tf-managed/modules/daily-snapshot/service/mainnet_cron_job
new file mode 100755
index 000000000..24eb56170
--- /dev/null
+++ b/tf-managed/modules/daily-snapshot/service/mainnet_cron_job
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+# shellcheck source=/dev/null
+source ~/.forest_env
+cd "$BASE_FOLDER" || exit
+flock -n /tmp/mainnet.lock -c "ruby daily_snapshot.rb mainnet > mainnet_log.txt 2>&1" || exit
+flock -n /tmp/mainnet_filops.lock -c "./upload_filops_snapshot.sh mainnet > filops_mainnet_log.txt 2>&1" || exit
diff --git a/tf-managed/modules/daily-snapshot/service/newrelic_fail2ban.sh b/tf-managed/modules/daily-snapshot/service/newrelic_fail2ban.sh
new file mode 100644
index 000000000..7e608ff65
--- /dev/null
+++ b/tf-managed/modules/daily-snapshot/service/newrelic_fail2ban.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+# This script configures New Relic infrastructure monitoring and Fail2Ban.
+# It sets up the New Relic license key and custom configuration, adds the New Relic repository,
+# refreshes it, and installs the New Relic infrastructure agent.
+# It also installs Fail2Ban, sets up its default configuration, and enables it to start at boot.
+
+set -euo pipefail
+# If new relic API key is provided, install the new relic agent
+if [ -n "$NEW_RELIC_API_KEY" ] ; then
+  curl -Ls https://download.newrelic.com/install/newrelic-cli/scripts/install.sh | bash && \
+  sudo NEW_RELIC_API_KEY="$NEW_RELIC_API_KEY" \
+  NEW_RELIC_ACCOUNT_ID="$NEW_RELIC_ACCOUNT_ID" \
+  NEW_RELIC_REGION="$NEW_RELIC_REGION" \
+  /usr/local/bin/newrelic install -y
+
+# The provided configurations are specific to New Relic. To gain a deeper understanding of these configuration details, you can visit:
+# https://docs.newrelic.com/docs/infrastructure/install-infrastructure-agent/configuration/infrastructure-agent-configuration-settings/#offline-time-to-reset
+cat >> /etc/newrelic-infra.yml <<EOF
+offline_time_to_reset: 24h
+EOF
+
+# Forward the snapshot service logs to New Relic.
+cat > /etc/newrelic-infra/logging.d/logging.yml <<EOF
+logs:
+  - name: daily_snapshot
+    file: /root/logs/*_log.txt
+EOF
+
+systemctl restart newrelic-infra
+fi
+
+# Install Fail2Ban with its default configuration and enable it at boot.
+apt-get install -y fail2ban
+systemctl enable fail2ban
+systemctl start fail2ban
diff --git a/tf-managed/modules/daily-snapshot/service/upload_snapshot.sh b/tf-managed/modules/daily-snapshot/service/upload_snapshot.sh
new file mode 100755
--- /dev/null
+++ b/tf-managed/modules/daily-snapshot/service/upload_snapshot.sh
+#!/bin/bash
+
+# Arguments, as passed from daily_snapshot.rb:
+#   $1 - chain name (calibnet or mainnet)
+#   $2 - daemon log file
+#   $3 - metrics log file
+CHAIN_NAME=$1
+LOG_EXPORT_DAEMON=$2
+LOG_EXPORT_METRICS=$3
+
+# Upper bound for 'forest-cli sync wait' below; mainnet needs far longer than calibnet.
+if [ "$CHAIN_NAME" = "mainnet" ]; then
+  SYNC_TIMEOUT=8h
+else
+  SYNC_TIMEOUT=30m
+fi
+
+# Commands to run inside the Forest container.
+COMMANDS=$(cat << HEREDOC
+set -x
+
+function write_metrics {
+  while true; do
+    curl --silent http://localhost:6116/metrics >> "$LOG_EXPORT_METRICS"
+    sleep 15
+  done
+}
+
+function print_forest_logs {
+  cat forest.err forest.out > $LOG_EXPORT_DAEMON
+}
+trap print_forest_logs EXIT
+
+echo "[client]" > config.toml
+echo 'data_dir = "/home/forest/forest_db"' >> config.toml
+echo 'encrypt_keystore = false' >> config.toml
+
+echo "Chain: $CHAIN_NAME"
+
+# spawn a task in the background to periodically write Prometheus metrics to a file
+(
+  set +x # Disable debugging for this subshell to keep the logs clean.
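+  # write_metrics (defined above) loops forever, appending a metrics sample to
+  # the metrics log roughly every 15 seconds; it stays alive until the
+  # 'kill %1' at the end of this command block stops it.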
+ write_metrics +) & + +forest-tool db destroy --force --config config.toml --chain "$CHAIN_NAME" + +forest --config config.toml --chain "$CHAIN_NAME" --auto-download-snapshot --halt-after-import +forest --config config.toml --chain "$CHAIN_NAME" --no-gc --save-token=token.txt --detach +timeout "$SYNC_TIMEOUT" forest-cli sync wait +forest-cli snapshot export -o forest_db/ +forest-cli --token=\$(cat token.txt) shutdown --force + +# Run full checks only for calibnet, given that it takes too long for mainnet. +if [ "$CHAIN_NAME" = "calibnet" ]; then + forest-tool snapshot validate --check-network "$CHAIN_NAME" forest_db/forest_snapshot_*.forest.car.zst +else + forest-tool archive info forest_db/forest_snapshot_*.forest.car.zst + forest-tool snapshot validate --check-links 0 --check-network "$CHAIN_NAME" --check-stateroots 5 forest_db/forest_snapshot_*.forest.car.zst +fi + + +# Kill the metrics writer process +kill %1 + +HEREDOC +) + +# Stop any lingering docker containers +CONTAINER_NAME="forest-snapshot-upload-node-$CHAIN_NAME" +docker stop "$CONTAINER_NAME" || true +docker rm --force "$CONTAINER_NAME" + +CHAIN_DB_DIR="$BASE_FOLDER/forest_db/$CHAIN_NAME" +CHAIN_LOGS_DIR="$BASE_FOLDER/logs" + +# Delete any existing snapshot files. It may be that the previous run failed +# before deleting those. +rm "$CHAIN_DB_DIR/forest_snapshot_$CHAIN_NAME"* + +# Run forest and generate a snapshot in forest_db/ +docker run \ + --name "$CONTAINER_NAME" \ + --rm \ + --user root \ + -v "$CHAIN_DB_DIR:/home/forest/forest_db":z \ + -v "$CHAIN_LOGS_DIR:/home/forest/logs":z \ + --entrypoint /bin/bash \ + ghcr.io/chainsafe/forest:"${FOREST_TAG}" \ + -c "$COMMANDS" || exit 1 + +aws --endpoint "$R2_ENDPOINT" s3 cp --no-progress "$CHAIN_DB_DIR/forest_snapshot_$CHAIN_NAME"*.forest.car.zst s3://forest-archive/"$CHAIN_NAME"/latest/ || exit 1 + +# Delete snapshot files +rm "$CHAIN_DB_DIR/forest_snapshot_$CHAIN_NAME"* diff --git a/tf-managed/modules/daily-snapshot/variable.tf b/tf-managed/modules/daily-snapshot/variable.tf new file mode 100644 index 000000000..104840b50 --- /dev/null +++ b/tf-managed/modules/daily-snapshot/variable.tf @@ -0,0 +1,116 @@ +variable "digitalocean_token" { + description = "Token for authentication." 
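+  # Normally supplied via the environment (TF_VAR_digitalocean_token), the
+  # standard Terraform convention, rather than stored on disk.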
+ type = string + sensitive = true +} + +variable "name" { + description = "The name of Forest Droplet" + type = string +} + +variable "size" { + description = "The size of the droplet instance to launch" + type = string +} + +variable "slack_channel" { + description = "slack channel name for notifications" + type = string +} + +variable "slack_token" { + description = "slack access token" + type = string + sensitive = true +} + +variable "R2_ACCESS_KEY" { + description = "S3 access key id" + type = string + sensitive = true +} + +variable "R2_SECRET_KEY" { + description = "S3 private access key" + type = string + sensitive = true +} + +variable "snapshot_bucket" { + description = "S3 bucket containing the snapshots" + type = string + default = "forest-snapshots" +} + +variable "r2_endpoint" { + description = "R2 endpoint for the snapshots" + type = string +} + +variable "snapshot_endpoint" { + description = "S3 endpoint for the snapshots" + type = string + default = "https://fra1.digitaloceanspaces.com/" +} + +variable "forest_tag" { + description = "Image tag for the Forest container" + type = string + default = "latest" +} + +variable "image" { + description = "The ID of the AMI to use for the Droplet" + type = string + default = "docker-20-04" +} + +variable "region" { + description = "The region where resources will be created" + type = string + default = "fra1" +} + +variable "project" { + description = "DigitalOcean project used as parent for the created droplet" + type = string + default = "Forest-DEV" # Alternative: "Default" +} + +variable "source_addresses" { + description = "List of source addresses." + type = list(string) + default = ["0.0.0.0/0", "::/0"] +} + +variable "destination_addresses" { + description = "List of destination addresses." 
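+  # For example, the default below (["0.0.0.0/0", "::/0"]) allows traffic to
+  # any IPv4/IPv6 address.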
+ type = list(string) + default = ["0.0.0.0/0", "::/0"] +} + +variable "NEW_RELIC_REGION" { + description = "The New Relic Platform Region" + type = string + default = "EU" +} + +variable "NEW_RELIC_API_KEY" { + description = "New Relic API KEY" + default = "" + type = string + sensitive = true +} + +variable "NEW_RELIC_ACCOUNT_ID" { + description = "The New Relic Account ID" + default = "" + type = string + sensitive = true +} + +variable "environment" { + description = "The environment name" + type = string +} diff --git a/tf-managed/modules/sync-check/main.tf b/tf-managed/modules/sync-check/main.tf index 26965bc60..7cd257fd6 100644 --- a/tf-managed/modules/sync-check/main.tf +++ b/tf-managed/modules/sync-check/main.tf @@ -86,12 +86,12 @@ locals { resource "digitalocean_droplet" "forest" { image = var.image - name = format("%s-%s", var.env, var.name) + name = format("%s-%s", var.environment, var.name) region = var.region size = var.size # Re-initialize resource if this hash changes: user_data = join("-", [data.local_file.sources.content_sha256, sha256(join("", local.init_commands))]) - tags = ["iac"] + tags = ["iac", var.environment] ssh_keys = data.digitalocean_ssh_keys.keys.ssh_keys[*].fingerprint monitoring = true diff --git a/tf-managed/modules/sync-check/variables.tf b/tf-managed/modules/sync-check/variables.tf index 4b3cc1db6..232caee43 100644 --- a/tf-managed/modules/sync-check/variables.tf +++ b/tf-managed/modules/sync-check/variables.tf @@ -67,7 +67,7 @@ variable "common_resources_dir" { type = string } -variable "env" { +variable "environment" { description = "The environment name" type = string } From 1b5d8575a073a30cfefb28ea34f8602d7c219866 Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Tue, 2 Jan 2024 16:42:58 +0100 Subject: [PATCH 07/46] adapt daily snapshot --- .../snapshot-service/terragrunt.hcl | 3 +++ .../applications/sync-check/terragrunt.hcl | 7 +++-- tf-managed/live/terragrunt.hcl | 2 +- tf-managed/modules/daily-snapshot/main.tf | 4 +-- .../daily-snapshot/service/daily_snapshot.rb | 26 +------------------ .../daily-snapshot/service/upload_snapshot.sh | 2 +- tf-managed/modules/daily-snapshot/variable.tf | 4 +++ 7 files changed, 17 insertions(+), 31 deletions(-) diff --git a/tf-managed/live/environments/dev/applications/snapshot-service/terragrunt.hcl b/tf-managed/live/environments/dev/applications/snapshot-service/terragrunt.hcl index 6090cfe9c..4bb456d8e 100644 --- a/tf-managed/live/environments/dev/applications/snapshot-service/terragrunt.hcl +++ b/tf-managed/live/environments/dev/applications/snapshot-service/terragrunt.hcl @@ -12,4 +12,7 @@ terraform { inputs = { name = "forest-snapshot" size = "s-4vcpu-16gb-amd" + r2_endpoint = "https://2238a825c5aca59233eab1f221f7aefb.r2.cloudflarestorage.com/" + forest_tag = "latest" + snapshot_bucket = "forest-archive-dev" } diff --git a/tf-managed/live/environments/prod/applications/sync-check/terragrunt.hcl b/tf-managed/live/environments/prod/applications/sync-check/terragrunt.hcl index d8961db58..4ae7a2e1c 100644 --- a/tf-managed/live/environments/prod/applications/sync-check/terragrunt.hcl +++ b/tf-managed/live/environments/prod/applications/sync-check/terragrunt.hcl @@ -6,10 +6,13 @@ include "root" { # Load the actual Terraform module terraform { - source = format("%s/../modules/sync-check", get_parent_terragrunt_dir()) + source = format("%s/../modules/daily-snapshot", get_parent_terragrunt_dir()) } inputs = { - name = "sync-check" + name = "forest-snapshot" size = "s-4vcpu-16gb-amd" + r2_endpoint = 
"https://2238a825c5aca59233eab1f221f7aefb.r2.cloudflarestorage.com/" + forest_tag = "v0.16.4" + snapshot_bucket = "forest-archive" } diff --git a/tf-managed/live/terragrunt.hcl b/tf-managed/live/terragrunt.hcl index 6ee0099d9..b98fa70af 100644 --- a/tf-managed/live/terragrunt.hcl +++ b/tf-managed/live/terragrunt.hcl @@ -39,5 +39,5 @@ remote_state { inputs = { common_resources_dir = format("%s/../common", get_parent_terragrunt_dir()) slack_channel = (local.env == "prod" ? "#forest-notifications" : "#forest-dump") - env = local.env + environment = local.env } diff --git a/tf-managed/modules/daily-snapshot/main.tf b/tf-managed/modules/daily-snapshot/main.tf index f83798ead..b260b398d 100644 --- a/tf-managed/modules/daily-snapshot/main.tf +++ b/tf-managed/modules/daily-snapshot/main.tf @@ -31,7 +31,7 @@ provider "digitalocean" { // Ugly hack because 'archive_file' cannot mix files and folders. data "external" "sources_tar" { - program = ["sh", "${path.module}/prep_sources.sh", path.module] + program = ["sh", "${path.module}/prep_sources.sh", path.module, var.common_resources_dir] } @@ -129,7 +129,7 @@ resource "digitalocean_project_resources" "connect_forest_project" { } resource "digitalocean_firewall" "forest-firewall" { - name = var.name + name = format("%s-%s", var.environment, var.name) inbound_rule { protocol = "tcp" diff --git a/tf-managed/modules/daily-snapshot/service/daily_snapshot.rb b/tf-managed/modules/daily-snapshot/service/daily_snapshot.rb index fa8e3ff29..dcc3be7c0 100644 --- a/tf-managed/modules/daily-snapshot/service/daily_snapshot.rb +++ b/tf-managed/modules/daily-snapshot/service/daily_snapshot.rb @@ -7,24 +7,11 @@ require 'date' require 'logger' require 'fileutils' -require 'active_support/time' BASE_FOLDER = get_and_assert_env_variable 'BASE_FOLDER' SLACK_TOKEN = get_and_assert_env_variable 'SLACK_API_TOKEN' CHANNEL = get_and_assert_env_variable 'SLACK_NOTIF_CHANNEL' -# Query the date of the most recent snapshot. -def latest_snapshot_date(chain_name = 'calibnet') - # We do not support HEAD requests but we _do_ support empty ranges. - filename = `curl --remote-name --remote-header-name --write-out "%{filename_effective}" --silent https://forest-archive.chainsafe.dev/latest/#{chain_name}/ -H "Range: bytes=0-0"` - # Curl will create a file with a single byte in it. Let's clean it up. - File.delete(filename) - snapshot_format = /^([^_]+?)_snapshot_(?[^_]+?)_(?\d{4}-\d{2}-\d{2})_height_(?\d+)(\.forest)?\.car.zst$/ - filename.match(snapshot_format) do |m| - m[:date].to_date - end -end - CHAIN_NAME = ARGV[0] raise 'No chain name supplied. Please provide chain identifier, e.g. calibnet or mainnet' if ARGV.empty? @@ -36,10 +23,6 @@ def latest_snapshot_date(chain_name = 'calibnet') client = SlackClient.new CHANNEL, SLACK_TOKEN -# Query the date of the most recent snapshot. This is used to limit the number -# of victory messages to 1/day even if we upload multiple snapshots per day. -date_before_export = latest_snapshot_date(CHAIN_NAME) - # conditionally add timestamps to logs without timestamps add_timestamps_cmd = %q[awk '{ if ($0 !~ /^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{6}Z/) print strftime("[%Y-%m-%d %H:%M:%S]"), $0; else print $0; fflush(); }'] upload_cmd = "set -o pipefail && \ @@ -48,14 +31,7 @@ def latest_snapshot_date(chain_name = 'calibnet') # The command needs to be run indirectly to avoid syntax errors in the shell. 
snapshot_uploaded = system('bash', '-c', upload_cmd, %i[out err] => LOG_EXPORT_SCRIPT_RUN) -if snapshot_uploaded - date_after_export = latest_snapshot_date(CHAIN_NAME) - - # If this is the first new snapshot of the day, send a victory message to slack - unless date_before_export == date_after_export - client.post_message "✅ Snapshot uploaded for #{CHAIN_NAME}. 🌲🌳🌲🌳🌲" - end -else +unless snapshot_uploaded client.post_message "⛔ Snapshot failed for #{CHAIN_NAME}. 🔥🌲🔥 " # attach the log file and print the contents to STDOUT [LOG_EXPORT_SCRIPT_RUN, LOG_EXPORT_DAEMON, LOG_EXPORT_METRICS].each do |log_file| diff --git a/tf-managed/modules/daily-snapshot/service/upload_snapshot.sh b/tf-managed/modules/daily-snapshot/service/upload_snapshot.sh index ff908d331..7cbd18dff 100755 --- a/tf-managed/modules/daily-snapshot/service/upload_snapshot.sh +++ b/tf-managed/modules/daily-snapshot/service/upload_snapshot.sh @@ -108,7 +108,7 @@ docker run \ ghcr.io/chainsafe/forest:"${FOREST_TAG}" \ -c "$COMMANDS" || exit 1 -aws --endpoint "$R2_ENDPOINT" s3 cp --no-progress "$CHAIN_DB_DIR/forest_snapshot_$CHAIN_NAME"*.forest.car.zst s3://forest-archive/"$CHAIN_NAME"/latest/ || exit 1 +aws --endpoint "$R2_ENDPOINT" s3 cp --no-progress "$CHAIN_DB_DIR/forest_snapshot_$CHAIN_NAME"*.forest.car.zst s3://"$SNAPSHOT_BUCKET"/"$CHAIN_NAME"/latest/ || exit 1 # Delete snapshot files rm "$CHAIN_DB_DIR/forest_snapshot_$CHAIN_NAME"* diff --git a/tf-managed/modules/daily-snapshot/variable.tf b/tf-managed/modules/daily-snapshot/variable.tf index 104840b50..f3c347302 100644 --- a/tf-managed/modules/daily-snapshot/variable.tf +++ b/tf-managed/modules/daily-snapshot/variable.tf @@ -110,6 +110,10 @@ variable "NEW_RELIC_ACCOUNT_ID" { sensitive = true } +variable "common_resources_dir" { + type = string +} + variable "environment" { description = "The environment name" type = string From 596cb782f7d9a753128db1876aec424252c41c9f Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Tue, 2 Jan 2024 17:59:57 +0100 Subject: [PATCH 08/46] defragment snapshot main --- tf-managed/modules/daily-snapshot/firewall.tf | 41 +++++++++++ tf-managed/modules/daily-snapshot/main.tf | 71 ------------------- tf-managed/modules/daily-snapshot/outputs.tf | 4 ++ tf-managed/modules/daily-snapshot/provider.tf | 23 ++++++ 4 files changed, 68 insertions(+), 71 deletions(-) create mode 100644 tf-managed/modules/daily-snapshot/firewall.tf create mode 100644 tf-managed/modules/daily-snapshot/outputs.tf create mode 100644 tf-managed/modules/daily-snapshot/provider.tf diff --git a/tf-managed/modules/daily-snapshot/firewall.tf b/tf-managed/modules/daily-snapshot/firewall.tf new file mode 100644 index 000000000..73c324bfb --- /dev/null +++ b/tf-managed/modules/daily-snapshot/firewall.tf @@ -0,0 +1,41 @@ +resource "digitalocean_firewall" "forest-firewall" { + name = format("%s-%s", var.environment, var.name) + + inbound_rule { + protocol = "tcp" + port_range = "22" + source_addresses = var.source_addresses + } + + inbound_rule { + protocol = "tcp" + port_range = "2345" + source_addresses = var.source_addresses + } + + inbound_rule { + protocol = "tcp" + port_range = "80" + source_addresses = var.source_addresses + } + + inbound_rule { + protocol = "udp" + port_range = "53" + source_addresses = var.source_addresses + } + + outbound_rule { + protocol = "tcp" + port_range = "all" + destination_addresses = var.destination_addresses + } + + outbound_rule { + protocol = "udp" + port_range = "53" + destination_addresses = var.destination_addresses + } + + droplet_ids = 
[digitalocean_droplet.forest.id] +} diff --git a/tf-managed/modules/daily-snapshot/main.tf b/tf-managed/modules/daily-snapshot/main.tf index b260b398d..b2c954f43 100644 --- a/tf-managed/modules/daily-snapshot/main.tf +++ b/tf-managed/modules/daily-snapshot/main.tf @@ -5,30 +5,6 @@ # - Copy over the zip file # - Run the init.sh script in the background -terraform { - required_version = "~> 1.3" - - required_providers { - digitalocean = { - source = "digitalocean/digitalocean" - version = "~> 2.0" - } - external = { - source = "hashicorp/external" - version = "~> 2.1" - } - local = { - source = "hashicorp/local" - version = "~> 2.1" - } - - } -} - -provider "digitalocean" { - token = var.digitalocean_token -} - // Ugly hack because 'archive_file' cannot mix files and folders. data "external" "sources_tar" { program = ["sh", "${path.module}/prep_sources.sh", path.module, var.common_resources_dir] @@ -127,50 +103,3 @@ resource "digitalocean_project_resources" "connect_forest_project" { project = data.digitalocean_project.forest_project.id resources = [digitalocean_droplet.forest.urn] } - -resource "digitalocean_firewall" "forest-firewall" { - name = format("%s-%s", var.environment, var.name) - - inbound_rule { - protocol = "tcp" - port_range = "22" - source_addresses = var.source_addresses - } - - inbound_rule { - protocol = "tcp" - port_range = "2345" - source_addresses = var.source_addresses - } - - inbound_rule { - protocol = "tcp" - port_range = "80" - source_addresses = var.source_addresses - } - - inbound_rule { - protocol = "udp" - port_range = "53" - source_addresses = var.source_addresses - } - - outbound_rule { - protocol = "tcp" - port_range = "all" - destination_addresses = var.destination_addresses - } - - outbound_rule { - protocol = "udp" - port_range = "53" - destination_addresses = var.destination_addresses - } - - droplet_ids = [digitalocean_droplet.forest.id] -} - -# This ip address may be used in the future by monitoring software -output "ip" { - value = [digitalocean_droplet.forest.ipv4_address] -} diff --git a/tf-managed/modules/daily-snapshot/outputs.tf b/tf-managed/modules/daily-snapshot/outputs.tf new file mode 100644 index 000000000..240c103f1 --- /dev/null +++ b/tf-managed/modules/daily-snapshot/outputs.tf @@ -0,0 +1,4 @@ +# This ip address may be used in the future by monitoring software +output "ip" { + value = [digitalocean_droplet.forest.ipv4_address] +} diff --git a/tf-managed/modules/daily-snapshot/provider.tf b/tf-managed/modules/daily-snapshot/provider.tf new file mode 100644 index 000000000..b92ed0b88 --- /dev/null +++ b/tf-managed/modules/daily-snapshot/provider.tf @@ -0,0 +1,23 @@ +terraform { + required_version = "~> 1.3" + + required_providers { + digitalocean = { + source = "digitalocean/digitalocean" + version = "~> 2.0" + } + external = { + source = "hashicorp/external" + version = "~> 2.1" + } + local = { + source = "hashicorp/local" + version = "~> 2.1" + } + + } +} + +provider "digitalocean" { + token = var.digitalocean_token +} From 6833137e4f3c4dca2d1dc8d0ccb6d2fb8dcfe3ba Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Tue, 2 Jan 2024 19:53:52 +0100 Subject: [PATCH 09/46] monitoring wip --- .../snapshot-service/terragrunt.hcl | 1 + tf-managed/modules/daily-snapshot/main.tf | 19 ++++++--- .../modules/daily-snapshot/monitoring/main.tf | 33 +++++++++++++++ .../daily-snapshot/monitoring/provider.tf | 16 ++++++++ .../daily-snapshot/monitoring/variable.tf | 41 +++++++++++++++++++ tf-managed/modules/daily-snapshot/provider.tf | 11 ++++- 
 tf-managed/modules/daily-snapshot/variable.tf | 16 +++++---
 7 files changed, 125 insertions(+), 12 deletions(-)
 create mode 100644 tf-managed/modules/daily-snapshot/monitoring/main.tf
 create mode 100644 tf-managed/modules/daily-snapshot/monitoring/provider.tf
 create mode 100644 tf-managed/modules/daily-snapshot/monitoring/variable.tf
diff --git a/tf-managed/live/environments/dev/applications/snapshot-service/terragrunt.hcl b/tf-managed/live/environments/dev/applications/snapshot-service/terragrunt.hcl
index 4bb456d8e..63156f712 100644
--- a/tf-managed/live/environments/dev/applications/snapshot-service/terragrunt.hcl
+++ b/tf-managed/live/environments/dev/applications/snapshot-service/terragrunt.hcl
@@ -15,4 +15,5 @@ inputs = {
   r2_endpoint = "https://2238a825c5aca59233eab1f221f7aefb.r2.cloudflarestorage.com/"
   forest_tag = "latest"
   snapshot_bucket = "forest-archive-dev"
+  monitoring = true
 }
diff --git a/tf-managed/modules/daily-snapshot/main.tf b/tf-managed/modules/daily-snapshot/main.tf
index b2c954f43..8c109f126 100644
--- a/tf-managed/modules/daily-snapshot/main.tf
+++ b/tf-managed/modules/daily-snapshot/main.tf
@@ -18,8 +18,7 @@ data "local_file" "sources" {
 // Note: The init.sh file is also included in the sources.zip such that the hash
 // of the archive captures the entire state of the machine.
 // This is a workaround, and because of this, we need to suppress the tflint warning here
-// for unused declarations related to the 'init.sh' file.
-// tflint-ignore: terraform_unused_declarations
+// for unused declarations related to the 'init.sh' file. tflint-ignore: terraform_unused_declarations
 data "local_file" "init" {
   filename = "${path.module}/service/init.sh"
 }
@@ -41,9 +40,9 @@ locals {
     slack_channel     = var.slack_channel,
     snapshot_bucket   = var.snapshot_bucket,
     snapshot_endpoint = var.snapshot_endpoint,
-    NEW_RELIC_API_KEY = var.NEW_RELIC_API_KEY,
-    NEW_RELIC_ACCOUNT_ID = var.NEW_RELIC_ACCOUNT_ID,
-    NEW_RELIC_REGION = var.NEW_RELIC_REGION,
+    NEW_RELIC_API_KEY = var.new_relic_api_key,
+    NEW_RELIC_ACCOUNT_ID = var.new_relic_account_id,
+    NEW_RELIC_REGION = var.new_relic_region,
     BASE_FOLDER       = "/root",
     forest_tag        = var.forest_tag
   })
@@ -60,11 +59,13 @@ locals {
     # Exiting without a sleep sometimes kills the script :-/
     "sleep 60s"
   ]
+
+  service_name = format("%s-%s", var.environment, var.name)
 }
 
 resource "digitalocean_droplet" "forest" {
   image  = var.image
-  name   = format("%s-%s", var.environment, var.name)
+  name   = local.service_name
   region = var.region
   size   = var.size
   # Re-initialize resource if this hash changes:
@@ -103,3 +104,9 @@ resource "digitalocean_project_resources" "connect_forest_project" {
   project   = data.digitalocean_project.forest_project.id
   resources = [digitalocean_droplet.forest.urn]
 }
+
+module "monitoring" {
+  count = var.monitoring ? 1 : 0
+  source = "./monitoring"
+  service_name = local.service_name
+}
diff --git a/tf-managed/modules/daily-snapshot/monitoring/main.tf b/tf-managed/modules/daily-snapshot/monitoring/main.tf
new file mode 100644
index 000000000..9ff6b3a4a
--- /dev/null
+++ b/tf-managed/modules/daily-snapshot/monitoring/main.tf
@@ -0,0 +1,33 @@
+# Creation of a new New Relic alert policy for infrastructure or container downtime
+resource "newrelic_alert_policy" "alert" {
+  name = format("%s alert policy", var.service_name)
+}
+
+# NRQL alert conditions for events such as host down, high disk/memory use,
+# and container down, each with defined criteria and thresholds.
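+#
+# For illustration only (hypothetical; this module currently defines just the
+# disk-space condition below), a host-down condition on the same policy could
+# be sketched as:
+#
+#   resource "newrelic_nrql_alert_condition" "host_down" {
+#     policy_id = newrelic_alert_policy.alert.id
+#     type      = "static"
+#     name      = "Host Down"
+#     nrql {
+#       query = "SELECT count(*) FROM SystemSample WHERE entityName = '${var.service_name}'"
+#     }
+#     critical {
+#       operator              = "below"
+#       threshold             = 1
+#       threshold_duration    = 300
+#       threshold_occurrences = "ALL"
+#     }
+#   }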
+resource "newrelic_nrql_alert_condition" "disk_space" { + policy_id = newrelic_alert_policy.alert.id + type = "static" + name = "High Disk Utilization" + description = "Alert when disk space usage is high on an the service host" + enabled = true + violation_time_limit_seconds = 3600 + + nrql { + query = "SELECT latest(diskUsedPercent) FROM StorageSample where entityName = '${var.service_name}'" + } + + critical { + operator = "above" + threshold = 85.0 + threshold_duration = 300 + threshold_occurrences = "ALL" + } + + warning { + operator = "above" + threshold = 70.0 + threshold_duration = 300 + threshold_occurrences = "ALL" + } +} diff --git a/tf-managed/modules/daily-snapshot/monitoring/provider.tf b/tf-managed/modules/daily-snapshot/monitoring/provider.tf new file mode 100644 index 000000000..29e275bd6 --- /dev/null +++ b/tf-managed/modules/daily-snapshot/monitoring/provider.tf @@ -0,0 +1,16 @@ +terraform { + required_version = "~> 1.3" + required_providers { + newrelic = { + source = "newrelic/newrelic" + version = "~> 3.0" + } + } +} + +# # Configure the New Relic provider +# provider "newrelic" { +# account_id = var.nr_account_id +# api_key = var.nr_api_key +# region = var.nr_region +# } diff --git a/tf-managed/modules/daily-snapshot/monitoring/variable.tf b/tf-managed/modules/daily-snapshot/monitoring/variable.tf new file mode 100644 index 000000000..43ecef6db --- /dev/null +++ b/tf-managed/modules/daily-snapshot/monitoring/variable.tf @@ -0,0 +1,41 @@ +# variable "new_relic_account_id" { +# type = string +# description = "The New Relic Account ID" +# sensitive = true +# } +# +# variable "new_relic_api_key" { +# description = "The New Relic API KEY" +# type = string +# sensitive = true +# } +# +# variable "new_relic_region" { +# description = "The New Relic Region" +# type = string +# } + +variable "service_name" { + description = "The name of the service" + type = string +} + +variable "enable_slack_notifications" { + description = "Enable Slack notifications" + type = bool + default = false +} + +# variable "slack_destination_id" { +# description = "The unique identifier for the Slack workspace where notifications will be sent." +# # TODO: parametrize +# default = "f902e020-5993-4425-9ae3-133084fc870d" +# type = string +# } +# +# variable "slack_channel_id" { +# description = "The unique identifier for the Slack channel(forest-notifications), where notifications will be posted." 
+# type = string +# # TODO: parametrize +# default = "C036TCEF0CU" +# } diff --git a/tf-managed/modules/daily-snapshot/provider.tf b/tf-managed/modules/daily-snapshot/provider.tf index b92ed0b88..7b86d4626 100644 --- a/tf-managed/modules/daily-snapshot/provider.tf +++ b/tf-managed/modules/daily-snapshot/provider.tf @@ -14,10 +14,19 @@ terraform { source = "hashicorp/local" version = "~> 2.1" } - + newrelic = { + source = "newrelic/newrelic" + version = "~> 3.0" + } } } provider "digitalocean" { token = var.digitalocean_token } + +provider "newrelic" { + account_id = var.new_relic_account_id + api_key = var.new_relic_api_key + region = var.new_relic_region +} diff --git a/tf-managed/modules/daily-snapshot/variable.tf b/tf-managed/modules/daily-snapshot/variable.tf index f3c347302..4ff30b8f2 100644 --- a/tf-managed/modules/daily-snapshot/variable.tf +++ b/tf-managed/modules/daily-snapshot/variable.tf @@ -90,23 +90,23 @@ variable "destination_addresses" { default = ["0.0.0.0/0", "::/0"] } -variable "NEW_RELIC_REGION" { +variable "new_relic_region" { description = "The New Relic Platform Region" type = string default = "EU" } -variable "NEW_RELIC_API_KEY" { +variable "new_relic_api_key" { description = "New Relic API KEY" default = "" type = string sensitive = true } -variable "NEW_RELIC_ACCOUNT_ID" { +variable "new_relic_account_id" { description = "The New Relic Account ID" - default = "" - type = string + default = 0 + type = number sensitive = true } @@ -118,3 +118,9 @@ variable "environment" { description = "The environment name" type = string } + +variable "monitoring" { + description = "Enable monitoring" + type = bool + default = false +} From 35026eb4e6f2bff21d6e15006fa6061828ba9098 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 2 Jan 2024 18:54:10 +0000 Subject: [PATCH 10/46] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tf-managed/modules/daily-snapshot/monitoring/variable.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tf-managed/modules/daily-snapshot/monitoring/variable.tf b/tf-managed/modules/daily-snapshot/monitoring/variable.tf index 43ecef6db..7f68d0eab 100644 --- a/tf-managed/modules/daily-snapshot/monitoring/variable.tf +++ b/tf-managed/modules/daily-snapshot/monitoring/variable.tf @@ -3,13 +3,13 @@ # description = "The New Relic Account ID" # sensitive = true # } -# +# # variable "new_relic_api_key" { # description = "The New Relic API KEY" # type = string # sensitive = true # } -# +# # variable "new_relic_region" { # description = "The New Relic Region" # type = string @@ -32,7 +32,7 @@ variable "enable_slack_notifications" { # default = "f902e020-5993-4425-9ae3-133084fc870d" # type = string # } -# +# # variable "slack_channel_id" { # description = "The unique identifier for the Slack channel(forest-notifications), where notifications will be posted." 
# type = string From b08eefb5cbf8de469b8e8a89a1c4a8ce625b650a Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Tue, 2 Jan 2024 19:55:40 +0100 Subject: [PATCH 11/46] remove unused --- .../daily-snapshot/monitoring/provider.tf | 7 ----- .../daily-snapshot/monitoring/variable.tf | 31 ------------------- 2 files changed, 38 deletions(-) diff --git a/tf-managed/modules/daily-snapshot/monitoring/provider.tf b/tf-managed/modules/daily-snapshot/monitoring/provider.tf index 29e275bd6..2e46564f7 100644 --- a/tf-managed/modules/daily-snapshot/monitoring/provider.tf +++ b/tf-managed/modules/daily-snapshot/monitoring/provider.tf @@ -7,10 +7,3 @@ terraform { } } } - -# # Configure the New Relic provider -# provider "newrelic" { -# account_id = var.nr_account_id -# api_key = var.nr_api_key -# region = var.nr_region -# } diff --git a/tf-managed/modules/daily-snapshot/monitoring/variable.tf b/tf-managed/modules/daily-snapshot/monitoring/variable.tf index 7f68d0eab..80d99e59d 100644 --- a/tf-managed/modules/daily-snapshot/monitoring/variable.tf +++ b/tf-managed/modules/daily-snapshot/monitoring/variable.tf @@ -1,20 +1,3 @@ -# variable "new_relic_account_id" { -# type = string -# description = "The New Relic Account ID" -# sensitive = true -# } -# -# variable "new_relic_api_key" { -# description = "The New Relic API KEY" -# type = string -# sensitive = true -# } -# -# variable "new_relic_region" { -# description = "The New Relic Region" -# type = string -# } - variable "service_name" { description = "The name of the service" type = string @@ -25,17 +8,3 @@ variable "enable_slack_notifications" { type = bool default = false } - -# variable "slack_destination_id" { -# description = "The unique identifier for the Slack workspace where notifications will be sent." -# # TODO: parametrize -# default = "f902e020-5993-4425-9ae3-133084fc870d" -# type = string -# } -# -# variable "slack_channel_id" { -# description = "The unique identifier for the Slack channel(forest-notifications), where notifications will be posted." -# type = string -# # TODO: parametrize -# default = "C036TCEF0CU" -# } From 7be2c6e638e0c6be8ecf92ae1683f50a4c5cda6d Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Wed, 3 Jan 2024 10:51:33 +0100 Subject: [PATCH 12/46] defragment sync check --- tf-managed/modules/sync-check/main.tf | 24 ----------------------- tf-managed/modules/sync-check/provider.tf | 23 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 24 deletions(-) create mode 100644 tf-managed/modules/sync-check/provider.tf diff --git a/tf-managed/modules/sync-check/main.tf b/tf-managed/modules/sync-check/main.tf index 7cd257fd6..03252b55f 100644 --- a/tf-managed/modules/sync-check/main.tf +++ b/tf-managed/modules/sync-check/main.tf @@ -5,30 +5,6 @@ # - Copy over the zip file # - Run calibnet and mainnet sync check in the background -terraform { - required_version = "~> 1.3" - - required_providers { - digitalocean = { - source = "digitalocean/digitalocean" - version = "~> 2.0" - } - external = { - source = "hashicorp/external" - version = "~> 2.1" - } - local = { - source = "hashicorp/local" - version = "~> 2.1" - } - - } -} - -provider "digitalocean" { - token = var.digitalocean_token -} - // Ugly hack because 'archive_file' cannot mix files and folders. 
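 // The 'external' data source contract: the program must print a single JSON
 // object on stdout, which Terraform exposes as data.external.sources_tar.result.
 // prep_sources.sh (shown earlier) emits { "path": ".../sources.tar" }, so the
 // archive path presumably comes back as data.external.sources_tar.result.path.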
data "external" "sources_tar" { program = ["sh", "${path.module}/prep_sources.sh", path.module, var.common_resources_dir] diff --git a/tf-managed/modules/sync-check/provider.tf b/tf-managed/modules/sync-check/provider.tf new file mode 100644 index 000000000..b92ed0b88 --- /dev/null +++ b/tf-managed/modules/sync-check/provider.tf @@ -0,0 +1,23 @@ +terraform { + required_version = "~> 1.3" + + required_providers { + digitalocean = { + source = "digitalocean/digitalocean" + version = "~> 2.0" + } + external = { + source = "hashicorp/external" + version = "~> 2.1" + } + local = { + source = "hashicorp/local" + version = "~> 2.1" + } + + } +} + +provider "digitalocean" { + token = var.digitalocean_token +} From 94d548b2501d57929f5d8e804cc4e9c64450a026 Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Wed, 3 Jan 2024 13:07:37 +0100 Subject: [PATCH 13/46] fix state keying --- tf-managed/live/terragrunt.hcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tf-managed/live/terragrunt.hcl b/tf-managed/live/terragrunt.hcl index b98fa70af..4068b4372 100644 --- a/tf-managed/live/terragrunt.hcl +++ b/tf-managed/live/terragrunt.hcl @@ -19,7 +19,7 @@ remote_state { ? "hubert-bucket-prod" : "hubert-bucket-dev" ) - key = "${local.env}-terraform.tfstate" + key = "${path_relative_to_include()}/terraform.tfstate" region = "eu-west-1" endpoint = "https://fra1.digitaloceanspaces.com" skip_bucket_versioning = true From c9f90a9c8af6d0052caa05ed76cfbc718613ea3d Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Wed, 3 Jan 2024 14:55:26 +0100 Subject: [PATCH 14/46] mail alerts --- .../snapshot-service/terragrunt.hcl | 5 +- tf-managed/modules/daily-snapshot/main.tf | 3 +- .../modules/daily-snapshot/monitoring/main.tf | 61 +++++++++++++++++-- .../daily-snapshot/monitoring/provider.tf | 2 +- .../daily-snapshot/monitoring/variable.tf | 6 ++ tf-managed/modules/daily-snapshot/provider.tf | 2 +- tf-managed/modules/daily-snapshot/variable.tf | 13 +++- tf-managed/modules/sync-check/provider.tf | 2 +- 8 files changed, 80 insertions(+), 14 deletions(-) diff --git a/tf-managed/live/environments/dev/applications/snapshot-service/terragrunt.hcl b/tf-managed/live/environments/dev/applications/snapshot-service/terragrunt.hcl index 63156f712..61f168e46 100644 --- a/tf-managed/live/environments/dev/applications/snapshot-service/terragrunt.hcl +++ b/tf-managed/live/environments/dev/applications/snapshot-service/terragrunt.hcl @@ -15,5 +15,8 @@ inputs = { r2_endpoint = "https://2238a825c5aca59233eab1f221f7aefb.r2.cloudflarestorage.com/" forest_tag = "latest" snapshot_bucket = "forest-archive-dev" - monitoring = true + + monitoring = { + enable = true, + } } diff --git a/tf-managed/modules/daily-snapshot/main.tf b/tf-managed/modules/daily-snapshot/main.tf index 8c109f126..0b24fa005 100644 --- a/tf-managed/modules/daily-snapshot/main.tf +++ b/tf-managed/modules/daily-snapshot/main.tf @@ -106,7 +106,8 @@ resource "digitalocean_project_resources" "connect_forest_project" { } module "monitoring" { - count = var.monitoring ? 1 : 0 + count = var.monitoring.enable ? 
1 : 0 source = "./monitoring" service_name = local.service_name + alert_email = var.monitoring.alert_email } diff --git a/tf-managed/modules/daily-snapshot/monitoring/main.tf b/tf-managed/modules/daily-snapshot/monitoring/main.tf index 9ff6b3a4a..8b65305e0 100644 --- a/tf-managed/modules/daily-snapshot/monitoring/main.tf +++ b/tf-managed/modules/daily-snapshot/monitoring/main.tf @@ -1,17 +1,18 @@ -# Creation of a new New Relic alert policy for infrastructure or Contianer downtime resource "newrelic_alert_policy" "alert" { name = format("%s alert policy", var.service_name) } -# NRQL alert conditions for events such as host down, high disk/memory use, -# and container down, each with defined criteria and thresholds. +locals { + enable_email = var.alert_email != "" +} + resource "newrelic_nrql_alert_condition" "disk_space" { policy_id = newrelic_alert_policy.alert.id type = "static" name = "High Disk Utilization" description = "Alert when disk space usage is high on an the service host" enabled = true - violation_time_limit_seconds = 3600 + # violation_time_limit_seconds = 3600 nrql { query = "SELECT latest(diskUsedPercent) FROM StorageSample where entityName = '${var.service_name}'" @@ -19,15 +20,63 @@ resource "newrelic_nrql_alert_condition" "disk_space" { critical { operator = "above" - threshold = 85.0 + # threshold = 85.0 + threshold = 20.0 threshold_duration = 300 threshold_occurrences = "ALL" } warning { operator = "above" - threshold = 70.0 + # threshold = 70.0 + threshold = 10.0 threshold_duration = 300 threshold_occurrences = "ALL" } } + +resource "newrelic_notification_destination" "email" { + count = local.enable_email ? 1 : 0 + name = format("%s email", var.service_name) + type = "EMAIL" + + property { + key = "email" + value = var.alert_email + } +} + +resource "newrelic_notification_channel" "email-channel" { + count = local.enable_email ? 1 : 0 + name = format("%s email", var.service_name) + type = "EMAIL" + product = "IINT" + destination_id = newrelic_notification_destination.email[0].id + + property { + key = "subject" + value = format("%s alert", var.service_name) + } +} + + +resource "newrelic_workflow" "alerting-workflow" { + count = local.enable_email ? 
1 : 0 + name = format("%s alerting workflow", var.service_name) + muting_rules_handling = "NOTIFY_ALL_ISSUES" + + issues_filter { + name = format("%s alerting workflow filter", var.service_name) + type = "FILTER" + + predicate { + attribute = "labels.policyIds" + operator = "EXACTLY_MATCHES" + values = [ newrelic_alert_policy.alert.id ] + } + } + + destination { + channel_id = newrelic_notification_channel.email-channel[0].id + } +} diff --git a/tf-managed/modules/daily-snapshot/monitoring/provider.tf b/tf-managed/modules/daily-snapshot/monitoring/provider.tf index 2e46564f7..992d8225f 100644 --- a/tf-managed/modules/daily-snapshot/monitoring/provider.tf +++ b/tf-managed/modules/daily-snapshot/monitoring/provider.tf @@ -1,5 +1,5 @@ terraform { - required_version = "~> 1.3" + required_version = "~> 1.6" required_providers { newrelic = { source = "newrelic/newrelic" diff --git a/tf-managed/modules/daily-snapshot/monitoring/variable.tf b/tf-managed/modules/daily-snapshot/monitoring/variable.tf index 80d99e59d..156b0e59f 100644 --- a/tf-managed/modules/daily-snapshot/monitoring/variable.tf +++ b/tf-managed/modules/daily-snapshot/monitoring/variable.tf @@ -8,3 +8,9 @@ variable "enable_slack_notifications" { type = bool default = false } + +variable "alert_email" { + description = "Email address to send alerts to" + type = string + default = "" +} diff --git a/tf-managed/modules/daily-snapshot/provider.tf b/tf-managed/modules/daily-snapshot/provider.tf index 7b86d4626..c553a31e0 100644 --- a/tf-managed/modules/daily-snapshot/provider.tf +++ b/tf-managed/modules/daily-snapshot/provider.tf @@ -1,5 +1,5 @@ terraform { - required_version = "~> 1.3" + required_version = "~> 1.6" required_providers { digitalocean = { diff --git a/tf-managed/modules/daily-snapshot/variable.tf b/tf-managed/modules/daily-snapshot/variable.tf index 4ff30b8f2..7e68fd33d 100644 --- a/tf-managed/modules/daily-snapshot/variable.tf +++ b/tf-managed/modules/daily-snapshot/variable.tf @@ -120,7 +120,14 @@ variable "environment" { } variable "monitoring" { - description = "Enable monitoring" - type = bool - default = false + description = "Service monitoring" + type = object({ + enable = optional(bool, false) + alert_email = optional(string, "") + }) + + default = { + enable = false, + alert_email = "" + } } diff --git a/tf-managed/modules/sync-check/provider.tf b/tf-managed/modules/sync-check/provider.tf index b92ed0b88..1f8588272 100644 --- a/tf-managed/modules/sync-check/provider.tf +++ b/tf-managed/modules/sync-check/provider.tf @@ -1,5 +1,5 @@ terraform { - required_version = "~> 1.3" + required_version = "~> 1.6" required_providers { digitalocean = { From 14a5d3104841600e655d2c9ab9a224cbcc8345bf Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Wed, 3 Jan 2024 18:07:37 +0100 Subject: [PATCH 15/46] slack notifications --- tf-managed/modules/daily-snapshot/main.tf | 3 ++ .../modules/daily-snapshot/monitoring/main.tf | 46 +++++++++++++++++-- .../daily-snapshot/monitoring/variable.tf | 18 ++++++-- tf-managed/modules/daily-snapshot/variable.tf | 17 ++++++- 4 files changed, 76 insertions(+), 8 deletions(-) diff --git a/tf-managed/modules/daily-snapshot/main.tf b/tf-managed/modules/daily-snapshot/main.tf index 0b24fa005..dac323ecb 100644 --- a/tf-managed/modules/daily-snapshot/main.tf +++ b/tf-managed/modules/daily-snapshot/main.tf @@ -110,4 +110,7 @@ module "monitoring" { source = "./monitoring" service_name = local.service_name alert_email = var.monitoring.alert_email + slack_enable = var.monitoring.slack_enable + 
slack_destination_id = var.monitoring.slack_destination_id + slack_channel_id = var.monitoring.slack_channel_id } diff --git a/tf-managed/modules/daily-snapshot/monitoring/main.tf b/tf-managed/modules/daily-snapshot/monitoring/main.tf index 8b65305e0..c2dee8d9b 100644 --- a/tf-managed/modules/daily-snapshot/monitoring/main.tf +++ b/tf-managed/modules/daily-snapshot/monitoring/main.tf @@ -12,7 +12,6 @@ resource "newrelic_nrql_alert_condition" "disk_space" { name = "High Disk Utilization" description = "Alert when disk space usage is high on an the service host" enabled = true - # violation_time_limit_seconds = 3600 nrql { query = "SELECT latest(diskUsedPercent) FROM StorageSample where entityName = '${var.service_name}'" @@ -59,10 +58,28 @@ resource "newrelic_notification_channel" "email-channel" { } } +resource "newrelic_notification_channel" "slack-channel" { + count = var.slack_enable ? 1 : 0 + name = format("%s slack", var.service_name) + type = "SLACK" + destination_id = var.slack_destination_id + product = "IINT" + + property { + key = "channelId" + value = var.slack_channel_id + } -resource "newrelic_workflow" "alerting-workflow" { + property { + key = "customDetailsSlack" + value = "issue id - {{issueId}}" + } +} + + +resource "newrelic_workflow" "alerting-workflow-mails" { count = local.enable_email ? 1 : 0 - name = format("%s alerting workflow", var.service_name) + name = format("%s mail alerting workflow", var.service_name) muting_rules_handling = "NOTIFY_ALL_ISSUES" issues_filter { @@ -80,3 +97,26 @@ resource "newrelic_workflow" "alerting-workflow" { channel_id = newrelic_notification_channel.email-channel[0].id } } + +# Limitation of NR provider - only one workflow can be created per channel. Might be resolved in the future. +# https://registry.terraform.io/providers/newrelic/newrelic/latest/docs/resources/workflow#nested-destination-blocks +resource "newrelic_workflow" "alerting-workflow-slack" { + count = var.slack_enable ? 
1 : 0 + name = format("%s slack alerting workflow", var.service_name) + muting_rules_handling = "NOTIFY_ALL_ISSUES" + + issues_filter { + name = format("%s alerting workflow filter", var.service_name) + type = "FILTER" + + predicate { + attribute = "labels.policyIds" + operator = "EXACTLY_MATCHES" + values = [ newrelic_alert_policy.alert.id ] + } + } + + destination { + channel_id = newrelic_notification_channel.slack-channel[0].id + } +} diff --git a/tf-managed/modules/daily-snapshot/monitoring/variable.tf b/tf-managed/modules/daily-snapshot/monitoring/variable.tf index 156b0e59f..76499aa60 100644 --- a/tf-managed/modules/daily-snapshot/monitoring/variable.tf +++ b/tf-managed/modules/daily-snapshot/monitoring/variable.tf @@ -3,14 +3,24 @@ variable "service_name" { type = string } -variable "enable_slack_notifications" { +variable "alert_email" { + description = "Email address to send alerts to" + type = string + default = "" +} + +variable "slack_enable" { description = "Enable Slack notifications" type = bool default = false } -variable "alert_email" { - description = "Email address to send alerts to" +variable "slack_destination_id" { + description = "Slack destination id" + type = string +} + +variable "slack_channel_id" { + description = "Slack channel id" type = string - default = "" } diff --git a/tf-managed/modules/daily-snapshot/variable.tf b/tf-managed/modules/daily-snapshot/variable.tf index 7e68fd33d..f23ec2b38 100644 --- a/tf-managed/modules/daily-snapshot/variable.tf +++ b/tf-managed/modules/daily-snapshot/variable.tf @@ -119,15 +119,30 @@ variable "environment" { type = string } +# Monitoring properties of the service. Can be declared partially. variable "monitoring" { - description = "Service monitoring" + description = "Service monitoring properties" type = object({ + # Whether to enable monitoring on the service. enable = optional(bool, false) + # Email (or comma-separated emails) to send alerts to in case of incidents. If empty, disabled email alerts. alert_email = optional(string, "") + # Whether to enable Slack notifications on the given channel. + slack_enable = optional(bool, false) + # Due to the limitations of NewRelic, this needs to be manually created via UI. + # See Slack section in: + # https://registry.terraform.io/providers/newrelic/newrelic/latest/docs/resources/notification_destination + # https://docs.newrelic.com/docs/alerts-applied-intelligence/notifications/notification-integrations/#slack + slack_destination_id = optional(string, "") + # Unique Slack channel ID - can be obtained at the bottom of the channel properties. 
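+  # (Channel IDs are short alphanumeric codes; "C036TCEF0CU" appears as an
+  # example default earlier in this changeset.)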
+ slack_channel_id = optional(string, "") }) default = { enable = false, alert_email = "" + slack_enable = false, + slack_destination_id = "" + slack_channel_id = "" } } From f3ee148ec21fe14c9c0602e8af622d13bf025e2c Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Thu, 4 Jan 2024 16:42:02 +0100 Subject: [PATCH 16/46] add synthetic monitoring snapshots age --- .../snapshot-monitoring/terragrunt.hcl | 10 ++++++++ .../modules/snapshot-monitoring/main.tf | 16 +++++++++++++ .../modules/snapshot-monitoring/provider.tf | 15 ++++++++++++ .../snapshot-age-monitor.js | 21 ++++++++++++++++ .../modules/snapshot-monitoring/variable.tf | 24 +++++++++++++++++++ 5 files changed, 86 insertions(+) create mode 100644 tf-managed/live/environments/dev/applications/snapshot-monitoring/terragrunt.hcl create mode 100644 tf-managed/modules/snapshot-monitoring/main.tf create mode 100644 tf-managed/modules/snapshot-monitoring/provider.tf create mode 100644 tf-managed/modules/snapshot-monitoring/snapshot-age-monitor.js create mode 100644 tf-managed/modules/snapshot-monitoring/variable.tf diff --git a/tf-managed/live/environments/dev/applications/snapshot-monitoring/terragrunt.hcl b/tf-managed/live/environments/dev/applications/snapshot-monitoring/terragrunt.hcl new file mode 100644 index 000000000..a92cb9f18 --- /dev/null +++ b/tf-managed/live/environments/dev/applications/snapshot-monitoring/terragrunt.hcl @@ -0,0 +1,10 @@ +# Automatically find the root terragrunt.hcl and inherit its +# configuration +include "root" { + path = find_in_parent_folders() +} + +# Load the actual Terraform module +terraform { + source = format("%s/../modules/snapshot-monitoring", get_parent_terragrunt_dir()) +} diff --git a/tf-managed/modules/snapshot-monitoring/main.tf b/tf-managed/modules/snapshot-monitoring/main.tf new file mode 100644 index 000000000..01930be72 --- /dev/null +++ b/tf-managed/modules/snapshot-monitoring/main.tf @@ -0,0 +1,16 @@ +resource "newrelic_synthetics_script_monitor" "snapshot-age-monitor" { + status = "ENABLED" + name = format("%s-snapshot-age-monitor", var.environment) + type = "SCRIPT_API" + + # https://docs.newrelic.com/docs/synthetics/synthetic-monitoring/administration/synthetic-public-minion-ips/#public-minion-locations-and-location-labels-location + # TODO - parameterize this + locations_public = ["AP_SOUTHEAST_1", "US_WEST_1", "EU_CENTRAL_1"] + period = "EVERY_HOUR" + script = file("snapshot-age-monitor.js") + + tag { + key = "service" + values = ["forest-snapshot"] + } +} diff --git a/tf-managed/modules/snapshot-monitoring/provider.tf b/tf-managed/modules/snapshot-monitoring/provider.tf new file mode 100644 index 000000000..987d73c13 --- /dev/null +++ b/tf-managed/modules/snapshot-monitoring/provider.tf @@ -0,0 +1,15 @@ +terraform { + required_version = "~> 1.6" + required_providers { + newrelic = { + source = "newrelic/newrelic" + version = "~> 3.0" + } + } +} + +provider "newrelic" { + account_id = var.new_relic_account_id + api_key = var.new_relic_api_key + region = var.new_relic_region +} diff --git a/tf-managed/modules/snapshot-monitoring/snapshot-age-monitor.js b/tf-managed/modules/snapshot-monitoring/snapshot-age-monitor.js new file mode 100644 index 000000000..7869b758d --- /dev/null +++ b/tf-managed/modules/snapshot-monitoring/snapshot-age-monitor.js @@ -0,0 +1,21 @@ +var assert = require('assert'); + +function check_snapshot(url, genesisTime) { + var callback = function (err, response, body) { + assert.equal(response.statusCode, 200, 'Expected a 200 OK response'); + + var snapshotName = 
response.url.split('/').pop(); + var height = snapshotName.match(/height_(\d+)/)[1]; + + var currentTime = Math.floor(Date.now() / 1000); + var snapshotTime = height * 30 + genesisTime; + var snapshotAgeInMinutes = (currentTime - snapshotTime) / 60; + + assert(snapshotAgeInMinutes < 360, 'Expected snapshot to be less than 360 minutes old'); + } + + $http.head(url, callback) +} + +check_snapshot('https://forest-archive.chainsafe.dev/latest/calibnet/', 1667326380) +check_snapshot('https://forest-archive.chainsafe.dev/latest/mainnet/', 1598306400) diff --git a/tf-managed/modules/snapshot-monitoring/variable.tf b/tf-managed/modules/snapshot-monitoring/variable.tf new file mode 100644 index 000000000..c2dd4debb --- /dev/null +++ b/tf-managed/modules/snapshot-monitoring/variable.tf @@ -0,0 +1,24 @@ +variable "environment" { + description = "The environment to deploy to" + type = string +} + +variable "new_relic_region" { + description = "The New Relic Platform Region" + type = string + default = "EU" +} + +variable "new_relic_api_key" { + description = "New Relic API KEY" + default = "" + type = string + sensitive = true +} + +variable "new_relic_account_id" { + description = "The New Relic Account ID" + default = 0 + type = number + sensitive = true +} From d68ee83a09283e9bddbe9d83a4ef55ec4e408bb6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 4 Jan 2024 15:42:29 +0000 Subject: [PATCH 17/46] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tf-managed/modules/snapshot-monitoring/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tf-managed/modules/snapshot-monitoring/main.tf b/tf-managed/modules/snapshot-monitoring/main.tf index 01930be72..bb88719d2 100644 --- a/tf-managed/modules/snapshot-monitoring/main.tf +++ b/tf-managed/modules/snapshot-monitoring/main.tf @@ -2,7 +2,7 @@ resource "newrelic_synthetics_script_monitor" "snapshot-age-monitor" { status = "ENABLED" name = format("%s-snapshot-age-monitor", var.environment) type = "SCRIPT_API" - + # https://docs.newrelic.com/docs/synthetics/synthetic-monitoring/administration/synthetic-public-minion-ips/#public-minion-locations-and-location-labels-location # TODO - parameterize this locations_public = ["AP_SOUTHEAST_1", "US_WEST_1", "EU_CENTRAL_1"] From fd8f1cd4ef6d7ee4db02a048e4b117121daf9f6c Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Thu, 4 Jan 2024 16:52:42 +0100 Subject: [PATCH 18/46] bolt new node version --- tf-managed/modules/snapshot-monitoring/main.tf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tf-managed/modules/snapshot-monitoring/main.tf b/tf-managed/modules/snapshot-monitoring/main.tf index bb88719d2..679cf2bcd 100644 --- a/tf-managed/modules/snapshot-monitoring/main.tf +++ b/tf-managed/modules/snapshot-monitoring/main.tf @@ -9,6 +9,10 @@ resource "newrelic_synthetics_script_monitor" "snapshot-age-monitor" { period = "EVERY_HOUR" script = file("snapshot-age-monitor.js") + script_language = "JAVASCRIPT" + runtime_type = "NODE_API" + runtime_type_version = "16.10" + tag { key = "service" values = ["forest-snapshot"] From 3b3a3950a9df892cba08d826e4c40f8f0a5c8742 Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Fri, 5 Jan 2024 17:12:15 +0100 Subject: [PATCH 19/46] log as metrics --- tf-managed/modules/daily-snapshot/main.tf | 1 + .../daily-snapshot/monitoring/event_rules.tf | 26 +++++++++++++++++++ .../daily-snapshot/monitoring/variable.tf | 5 ++++ 
.../daily-snapshot/service/calibnet_cron_job | 4 +-- .../daily-snapshot/service/daily_snapshot.rb | 14 +++++++--- .../modules/daily-snapshot/service/init.sh | 2 +- 6 files changed, 46 insertions(+), 6 deletions(-) create mode 100644 tf-managed/modules/daily-snapshot/monitoring/event_rules.tf diff --git a/tf-managed/modules/daily-snapshot/main.tf b/tf-managed/modules/daily-snapshot/main.tf index dac323ecb..dbdca40fa 100644 --- a/tf-managed/modules/daily-snapshot/main.tf +++ b/tf-managed/modules/daily-snapshot/main.tf @@ -113,4 +113,5 @@ module "monitoring" { slack_enable = var.monitoring.slack_enable slack_destination_id = var.monitoring.slack_destination_id slack_channel_id = var.monitoring.slack_channel_id + new_relic_account_id = var.new_relic_account_id } diff --git a/tf-managed/modules/daily-snapshot/monitoring/event_rules.tf b/tf-managed/modules/daily-snapshot/monitoring/event_rules.tf new file mode 100644 index 000000000..4a50e5fb7 --- /dev/null +++ b/tf-managed/modules/daily-snapshot/monitoring/event_rules.tf @@ -0,0 +1,26 @@ +resource "newrelic_events_to_metrics_rule" "generate_snapshot_attempt_metrics" { + account_id = var.new_relic_account_id + for_each = toset(["mainnet", "calibnet"]) + + name = format("%s %s snapshot generation attempts", var.service_name, each.key) + description = "Snapshot generation attempts" + nrql = "From Log select uniqueCount(message) as '${var.service_name}.${each.key}.snapshot_generation_run' WHERE `hostname` = '${var.service_name}' AND filePath ='/root/logs/${each.key}_log.txt' AND message LIKE '%running snapshot export%'" +} + +resource "newrelic_events_to_metrics_rule" "generate_snapshot_success_metrics" { + account_id = var.new_relic_account_id + for_each = toset(["mainnet", "calibnet"]) + + name = format("%s %s snapshot generation success", var.service_name, each.key) + description = "Success snapshot generations" + nrql = "From Log select uniqueCount(message) as '${var.service_name}.${each.key}.snapshot_generation_ok' WHERE `hostname` = '${var.service_name}' AND filePath ='/root/logs/${each.key}_log.txt' AND message LIKE '%Snapshot uploaded for%'" +} + +resource "newrelic_events_to_metrics_rule" "generate_snapshot_fail_metrics" { + account_id = var.new_relic_account_id + for_each = toset(["mainnet", "calibnet"]) + + name = format("%s %s snapshot generation failure", var.service_name, each.key) + description = "Failed snapshot generations" + nrql = "From Log select uniqueCount(message) as '${var.service_name}.${each.key}.snapshot_generation_fail' WHERE `hostname` = '${var.service_name}' AND filePath ='/root/logs/${each.key}_log.txt' AND message LIKE '%Snapshot upload failed for%'" +} diff --git a/tf-managed/modules/daily-snapshot/monitoring/variable.tf b/tf-managed/modules/daily-snapshot/monitoring/variable.tf index 76499aa60..484be045e 100644 --- a/tf-managed/modules/daily-snapshot/monitoring/variable.tf +++ b/tf-managed/modules/daily-snapshot/monitoring/variable.tf @@ -24,3 +24,8 @@ variable "slack_channel_id" { description = "Slack channel id" type = string } + +variable "new_relic_account_id" { + description = "New Relic account id" + type = number +} diff --git a/tf-managed/modules/daily-snapshot/service/calibnet_cron_job b/tf-managed/modules/daily-snapshot/service/calibnet_cron_job index a492ad45d..e1b5fdfa0 100755 --- a/tf-managed/modules/daily-snapshot/service/calibnet_cron_job +++ b/tf-managed/modules/daily-snapshot/service/calibnet_cron_job @@ -3,5 +3,5 @@ # shellcheck source=/dev/null source ~/.forest_env cd "$BASE_FOLDER" || exit 
-flock -n /tmp/calibnet.lock -c "ruby daily_snapshot.rb calibnet > logs/calibnet_log.txt 2>&1" -flock -n /tmp/calibnet_filops.lock -c "./upload_filops_snapshot.sh calibnet > logs/filops_calibnet_log.txt 2>&1" +flock -n /tmp/calibnet.lock -c "ruby daily_snapshot.rb calibnet >> logs/calibnet_log.txt 2>&1" +flock -n /tmp/calibnet_filops.lock -c "./upload_filops_snapshot.sh calibnet >> logs/filops_calibnet_log.txt 2>&1" diff --git a/tf-managed/modules/daily-snapshot/service/daily_snapshot.rb b/tf-managed/modules/daily-snapshot/service/daily_snapshot.rb index dcc3be7c0..39344382d 100644 --- a/tf-managed/modules/daily-snapshot/service/daily_snapshot.rb +++ b/tf-managed/modules/daily-snapshot/service/daily_snapshot.rb @@ -22,6 +22,7 @@ LOG_EXPORT_METRICS = "logs/#{CHAIN_NAME}_#{DATE}_metrics.txt" client = SlackClient.new CHANNEL, SLACK_TOKEN +logger = Logger.new($stdout) # conditionally add timestamps to logs without timestamps add_timestamps_cmd = %q[awk '{ if ($0 !~ /^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{6}Z/) print strftime("[%Y-%m-%d %H:%M:%S]"), $0; else print $0; fflush(); }'] @@ -29,9 +30,16 @@ timeout --signal=KILL 8h ./upload_snapshot.sh #{CHAIN_NAME} #{LOG_EXPORT_DAEMON} #{LOG_EXPORT_METRICS} | #{add_timestamps_cmd}" # The command needs to be run indirectly to avoid syntax errors in the shell. +logger.info "Running snapshot export script for #{CHAIN_NAME}..." snapshot_uploaded = system('bash', '-c', upload_cmd, %i[out err] => LOG_EXPORT_SCRIPT_RUN) - -unless snapshot_uploaded +logger.info "Snapshot export script finished for #{CHAIN_NAME}." + +if snapshot_uploaded + # This log message is important, as it is used by the monitoring tools to determine whether the snapshot was + # successfully uploaded. + logger.info "Snapshot uploaded for #{CHAIN_NAME}." +else + logger.error "Snapshot upload failed for #{CHAIN_NAME}." client.post_message "⛔ Snapshot failed for #{CHAIN_NAME}. 
🔥🌲🔥 " # attach the log file and print the contents to STDOUT [LOG_EXPORT_SCRIPT_RUN, LOG_EXPORT_DAEMON, LOG_EXPORT_METRICS].each do |log_file| @@ -40,5 +48,5 @@ end [LOG_EXPORT_SCRIPT_RUN, LOG_EXPORT_DAEMON, LOG_EXPORT_METRICS].each do |log_file| - puts "Snapshot export log:\n#{File.read(log_file)}\n\n" if File.exist?(log_file) + logger.info "Snapshot export log:\n#{File.read(log_file)}\n\n" if File.exist?(log_file) end diff --git a/tf-managed/modules/daily-snapshot/service/init.sh b/tf-managed/modules/daily-snapshot/service/init.sh index 2ad8d923a..3d19b6dfb 100755 --- a/tf-managed/modules/daily-snapshot/service/init.sh +++ b/tf-managed/modules/daily-snapshot/service/init.sh @@ -35,7 +35,7 @@ mkdir --parents -- "$BASE_FOLDER/forest_db/filops" chmod +x ./upload_filops_snapshot.sh # Run new_relic and fail2ban scripts -bash newrelic_fail2ban.sh & +bash newrelic_fail2ban.sh # Setup cron jobs cp calibnet_cron_job mainnet_cron_job /etc/cron.hourly/ From 8a9bb7d3734a17fc590a7ed930820086b0fcdf54 Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Thu, 11 Jan 2024 17:41:49 +0100 Subject: [PATCH 20/46] cleanups, tinkering --- .../applications/sync-check/terragrunt.hcl | 8 +- tf-managed/live/terragrunt.hcl | 32 ++-- tf-managed/modules/daily-snapshot/main.tf | 12 +- .../daily-snapshot/monitoring/event_rules.tf | 20 ++- .../modules/daily-snapshot/monitoring/main.tf | 161 ++++++++++++------ .../modules/daily-snapshot/prep_sources.sh | 2 +- .../service/newrelic_fail2ban.sh | 13 +- tf-managed/modules/daily-snapshot/variable.tf | 12 +- .../modules/snapshot-monitoring/main.tf | 12 +- tf-managed/modules/sync-check/main.tf | 2 +- tf-managed/modules/sync-check/prep_sources.sh | 2 +- tf-managed/modules/sync-check/variables.tf | 2 +- 12 files changed, 173 insertions(+), 105 deletions(-) diff --git a/tf-managed/live/environments/prod/applications/sync-check/terragrunt.hcl b/tf-managed/live/environments/prod/applications/sync-check/terragrunt.hcl index 4ae7a2e1c..937c00e05 100644 --- a/tf-managed/live/environments/prod/applications/sync-check/terragrunt.hcl +++ b/tf-managed/live/environments/prod/applications/sync-check/terragrunt.hcl @@ -10,9 +10,9 @@ terraform { } inputs = { - name = "forest-snapshot" - size = "s-4vcpu-16gb-amd" - r2_endpoint = "https://2238a825c5aca59233eab1f221f7aefb.r2.cloudflarestorage.com/" - forest_tag = "v0.16.4" + name = "forest-snapshot" + size = "s-4vcpu-16gb-amd" + r2_endpoint = "https://2238a825c5aca59233eab1f221f7aefb.r2.cloudflarestorage.com/" + forest_tag = "v0.16.4" snapshot_bucket = "forest-archive" } diff --git a/tf-managed/live/terragrunt.hcl b/tf-managed/live/terragrunt.hcl index 4068b4372..96655be36 100644 --- a/tf-managed/live/terragrunt.hcl +++ b/tf-managed/live/terragrunt.hcl @@ -10,28 +10,28 @@ locals { remote_state { backend = "s3" generate = { - path = "backend.tf" + path = "backend.tf" if_exists = "overwrite_terragrunt" } config = { // if the environment is dev, use the dev bucket, otherwise use the prod bucket bucket = (local.env == "prod" - ? "hubert-bucket-prod" - : "hubert-bucket-dev" - ) - key = "${path_relative_to_include()}/terraform.tfstate" - region = "eu-west-1" - endpoint = "https://fra1.digitaloceanspaces.com" - skip_bucket_versioning = true - skip_bucket_ssencryption = true - skip_bucket_root_access = true + ? 
"hubert-bucket-prod" + : "hubert-bucket-dev" + ) + key = "${path_relative_to_include()}/terraform.tfstate" + region = "eu-west-1" + endpoint = "https://fra1.digitaloceanspaces.com" + skip_bucket_versioning = true + skip_bucket_ssencryption = true + skip_bucket_root_access = true skip_bucket_public_access_blocking = true - skip_bucket_enforced_tls = true - skip_credentials_validation = true - skip_metadata_api_check = true - skip_requesting_account_id = true - skip_s3_checksum = true - skip_region_validation = true + skip_bucket_enforced_tls = true + skip_credentials_validation = true + skip_metadata_api_check = true + skip_requesting_account_id = true + skip_s3_checksum = true + skip_region_validation = true } } diff --git a/tf-managed/modules/daily-snapshot/main.tf b/tf-managed/modules/daily-snapshot/main.tf index dbdca40fa..26b343043 100644 --- a/tf-managed/modules/daily-snapshot/main.tf +++ b/tf-managed/modules/daily-snapshot/main.tf @@ -106,12 +106,12 @@ resource "digitalocean_project_resources" "connect_forest_project" { } module "monitoring" { - count = var.monitoring.enable ? 1 : 0 - source = "./monitoring" - service_name = local.service_name - alert_email = var.monitoring.alert_email - slack_enable = var.monitoring.slack_enable + count = var.monitoring.enable ? 1 : 0 + source = "./monitoring" + service_name = local.service_name + alert_email = var.monitoring.alert_email + slack_enable = var.monitoring.slack_enable slack_destination_id = var.monitoring.slack_destination_id - slack_channel_id = var.monitoring.slack_channel_id + slack_channel_id = var.monitoring.slack_channel_id new_relic_account_id = var.new_relic_account_id } diff --git a/tf-managed/modules/daily-snapshot/monitoring/event_rules.tf b/tf-managed/modules/daily-snapshot/monitoring/event_rules.tf index 4a50e5fb7..a7381dada 100644 --- a/tf-managed/modules/daily-snapshot/monitoring/event_rules.tf +++ b/tf-managed/modules/daily-snapshot/monitoring/event_rules.tf @@ -1,26 +1,28 @@ +# This file constains NR event rules used to generate metrics from logs, given that +# the service is not generating metrics by itself. 
resource "newrelic_events_to_metrics_rule" "generate_snapshot_attempt_metrics" { account_id = var.new_relic_account_id - for_each = toset(["mainnet", "calibnet"]) + for_each = toset(["mainnet", "calibnet"]) - name = format("%s %s snapshot generation attempts", var.service_name, each.key) + name = format("%s %s snapshot generation attempts", var.service_name, each.key) description = "Snapshot generation attempts" - nrql = "From Log select uniqueCount(message) as '${var.service_name}.${each.key}.snapshot_generation_run' WHERE `hostname` = '${var.service_name}' AND filePath ='/root/logs/${each.key}_log.txt' AND message LIKE '%running snapshot export%'" + nrql = "From Log select uniqueCount(message) as '${var.service_name}.${each.key}.snapshot_generation_run' WHERE `hostname` = '${var.service_name}' AND filePath ='/root/logs/${each.key}_log.txt' AND message LIKE '%running snapshot export%'" } resource "newrelic_events_to_metrics_rule" "generate_snapshot_success_metrics" { account_id = var.new_relic_account_id - for_each = toset(["mainnet", "calibnet"]) + for_each = toset(["mainnet", "calibnet"]) - name = format("%s %s snapshot generation success", var.service_name, each.key) + name = format("%s %s snapshot generation success", var.service_name, each.key) description = "Success snapshot generations" - nrql = "From Log select uniqueCount(message) as '${var.service_name}.${each.key}.snapshot_generation_ok' WHERE `hostname` = '${var.service_name}' AND filePath ='/root/logs/${each.key}_log.txt' AND message LIKE '%Snapshot uploaded for%'" + nrql = "From Log select uniqueCount(message) as '${var.service_name}.${each.key}.snapshot_generation_ok' WHERE `hostname` = '${var.service_name}' AND filePath ='/root/logs/${each.key}_log.txt' AND message LIKE '%Snapshot uploaded for%'" } resource "newrelic_events_to_metrics_rule" "generate_snapshot_fail_metrics" { account_id = var.new_relic_account_id - for_each = toset(["mainnet", "calibnet"]) + for_each = toset(["mainnet", "calibnet"]) - name = format("%s %s snapshot generation failure", var.service_name, each.key) + name = format("%s %s snapshot generation failure", var.service_name, each.key) description = "Failed snapshot generations" - nrql = "From Log select uniqueCount(message) as '${var.service_name}.${each.key}.snapshot_generation_fail' WHERE `hostname` = '${var.service_name}' AND filePath ='/root/logs/${each.key}_log.txt' AND message LIKE '%Snapshot upload failed for%'" + nrql = "From Log select uniqueCount(message) as '${var.service_name}.${each.key}.snapshot_generation_fail' WHERE `hostname` = '${var.service_name}' AND filePath ='/root/logs/${each.key}_log.txt' AND message LIKE '%Snapshot upload failed for%'" } diff --git a/tf-managed/modules/daily-snapshot/monitoring/main.tf b/tf-managed/modules/daily-snapshot/monitoring/main.tf index c2dee8d9b..a96e46e4f 100644 --- a/tf-managed/modules/daily-snapshot/monitoring/main.tf +++ b/tf-managed/modules/daily-snapshot/monitoring/main.tf @@ -6,80 +6,80 @@ locals { enable_email = var.alert_email != "" } -resource "newrelic_nrql_alert_condition" "disk_space" { - policy_id = newrelic_alert_policy.alert.id - type = "static" - name = "High Disk Utilization" - description = "Alert when disk space usage is high on an the service host" - enabled = true - - nrql { - query = "SELECT latest(diskUsedPercent) FROM StorageSample where entityName = '${var.service_name}'" - } - - critical { - operator = "above" - # threshold = 85.0 - threshold = 20.0 - threshold_duration = 300 - threshold_occurrences = "ALL" - } - - warning 
{ - operator = "above" - # threshold = 70.0 - threshold = 10.0 - threshold_duration = 300 - threshold_occurrences = "ALL" - } -} +# resource "newrelic_nrql_alert_condition" "disk_space" { +# policy_id = newrelic_alert_policy.alert.id +# type = "static" +# name = "High Disk Utilization" +# description = "Alert when disk space usage is high on an the service host" +# enabled = true +# +# nrql { +# query = "SELECT latest(diskUsedPercent) FROM StorageSample where entityName = '${var.service_name}'" +# } +# +# critical { +# operator = "above" +# # threshold = 85.0 +# threshold = 20.0 +# threshold_duration = 300 +# threshold_occurrences = "ALL" +# } +# +# warning { +# operator = "above" +# # threshold = 70.0 +# threshold = 10.0 +# threshold_duration = 300 +# threshold_occurrences = "ALL" +# } +# } resource "newrelic_notification_destination" "email" { count = local.enable_email ? 1 : 0 name = format("%s email", var.service_name) - type = "EMAIL" + type = "EMAIL" property { - key = "email" + key = "email" value = var.alert_email } } resource "newrelic_notification_channel" "email-channel" { - count = local.enable_email ? 1 : 0 - name = format("%s email", var.service_name) - type = "EMAIL" - product = "IINT" + count = local.enable_email ? 1 : 0 + name = format("%s email", var.service_name) + type = "EMAIL" + product = "IINT" destination_id = newrelic_notification_destination.email[0].id property { - key = "subject" + key = "subject" value = format("%s alert", var.service_name) } } resource "newrelic_notification_channel" "slack-channel" { - count = var.slack_enable ? 1 : 0 - name = format("%s slack", var.service_name) - type = "SLACK" + count = var.slack_enable ? 1 : 0 + name = format("%s slack", var.service_name) + type = "SLACK" destination_id = var.slack_destination_id - product = "IINT" + product = "IINT" property { - key = "channelId" + key = "channelId" value = var.slack_channel_id } property { - key = "customDetailsSlack" + key = "customDetailsSlack" value = "issue id - {{issueId}}" } } resource "newrelic_workflow" "alerting-workflow-mails" { - count = local.enable_email ? 1 : 0 - name = format("%s mail alerting workflow", var.service_name) + count = local.enable_email ? 1 : 0 + name = format("%s mail alerting workflow", var.service_name) muting_rules_handling = "NOTIFY_ALL_ISSUES" issues_filter { @@ -88,8 +88,8 @@ resource "newrelic_workflow" "alerting-workflow-mails" { predicate { attribute = "labels.policyIds" - operator = "EXACTLY_MATCHES" - values = [ newrelic_alert_policy.alert.id ] + operator = "EXACTLY_MATCHES" + values = [newrelic_alert_policy.alert.id] } } @@ -101,8 +101,8 @@ resource "newrelic_workflow" "alerting-workflow-mails" { # Limitation of NR provider - only one workflow can be created per channel. Might be resolved in the future. # https://registry.terraform.io/providers/newrelic/newrelic/latest/docs/resources/workflow#nested-destination-blocks resource "newrelic_workflow" "alerting-workflow-slack" { - count = var.slack_enable ? 1 : 0 - name = format("%s slack alerting workflow", var.service_name) + count = var.slack_enable ? 
1 : 0
+  name                  = format("%s slack alerting workflow", var.service_name)
   muting_rules_handling = "NOTIFY_ALL_ISSUES"

   issues_filter {
@@ -111,8 +111,8 @@ resource "newrelic_workflow" "alerting-workflow-slack" {

     predicate {
       attribute = "labels.policyIds"
-      operator = "EXACTLY_MATCHES"
-      values = [ newrelic_alert_policy.alert.id ]
+      operator  = "EXACTLY_MATCHES"
+      values    = [newrelic_alert_policy.alert.id]
     }
   }

@@ -120,3 +120,68 @@ resource "newrelic_workflow" "alerting-workflow-slack" {
     channel_id = newrelic_notification_channel.slack-channel[0].id
   }
 }
+
+# At least 1 snapshot is generated in a 5-hour interval
+resource "newrelic_nrql_alert_condition" "snapshot_frequency_condition" {
+  for_each    = toset(["mainnet", "calibnet"])
+  policy_id   = newrelic_alert_policy.alert.id
+  type        = "static"
+  name        = format("Low snapshot generation frequency - %s", each.key)
+  description = "Alert when snapshots are not generated within the required time interval"
+  enabled     = true
+
+  # evaluation_delay = 7200 # 2 hours, it may take some time to generate a snapshot
+  # aggregation_window = 14400 # 4 hours, it may take some time to generate a snapshot
+  aggregation_window = 360 # 6 minutes; the 4-hour value above is kept for reference
+
+
+  nrql {
+    query = format("FROM Metric SELECT count(`${var.service_name}.${each.key}.snapshot_generation_ok`)")
+  }
+
+  warning {
+    operator = "below"
+    threshold = 1
+    # threshold_duration = 14400
+    threshold_duration = 360
+    threshold_occurrences = "ALL"
+  }
+
+  critical {
+    operator = "below"
+    threshold = 1
+    # threshold_duration = 28800
+    threshold_duration = 720
+    threshold_occurrences = "ALL"
+  }
+}
+
+# At least 1 successful snapshot out of 3 attempts
+
+#resource "newrelic_nrql_alert_condition" "disk_space" {
+#  policy_id = newrelic_alert_policy.alert.id
+#  type = "static"
+#  name = "High Disk Utilization"
+#  description = "Alert when disk space usage is high on the service host"
+#  enabled = true
+#
+#  nrql {
+#    query = "SELECT latest(diskUsedPercent) FROM StorageSample where entityName = '${var.service_name}'"
+#  }
+#
+#  critical {
+#    operator = "above"
+#    # threshold = 85.0
+#    threshold = 20.0
+#    threshold_duration = 300
+#    threshold_occurrences = "ALL"
+#  }
+#
+#  warning {
+#    operator = "above"
+#    # threshold = 70.0
+#    threshold = 10.0
+#    threshold_duration = 300
+#    threshold_occurrences = "ALL"
+#  }
+#}
diff --git a/tf-managed/modules/daily-snapshot/prep_sources.sh b/tf-managed/modules/daily-snapshot/prep_sources.sh
index 6324e5da4..05d9a16df 100755
--- a/tf-managed/modules/daily-snapshot/prep_sources.sh
+++ b/tf-managed/modules/daily-snapshot/prep_sources.sh
@@ -6,7 +6,7 @@ set -euxo pipefail

 # Copy local source files in a folder together with ruby_common and create a zip archive. 
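# Why the tar invocation below pins --sort=name and a fixed --mtime: the
# resulting archive is hashed (content_sha256) and fed into the droplet's
# user_data, so Terraform re-creates the machine only when the hash changes (the
# same mechanism is visible in the legacy module's main.tf removed later in this
# series). A deterministic archive keeps that hash stable across checkouts;
# without these flags, file ordering and timestamps would perturb the hash on
# every run and trigger spurious redeployments.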
cd "$1" -cp --archive $2/ruby_common service/ +cp --archive "$2"/ruby_common service/ rm -f sources.tar (cd service && tar cf ../sources.tar --sort=name --mtime='UTC 2019-01-01' ./* > /dev/null 2>&1) diff --git a/tf-managed/modules/daily-snapshot/service/newrelic_fail2ban.sh b/tf-managed/modules/daily-snapshot/service/newrelic_fail2ban.sh index 7e608ff65..00d885dcc 100644 --- a/tf-managed/modules/daily-snapshot/service/newrelic_fail2ban.sh +++ b/tf-managed/modules/daily-snapshot/service/newrelic_fail2ban.sh @@ -17,11 +17,11 @@ if [ -n "$NEW_RELIC_API_KEY" ] ; then # https://docs.newrelic.com/docs/infrastructure/install-infrastructure-agent/configuration/infrastructure-agent-configuration-settings/#offline-time-to-reset cat >> /etc/newrelic-infra.yml < /dev/null 2>&1) diff --git a/tf-managed/modules/sync-check/variables.tf b/tf-managed/modules/sync-check/variables.tf index 232caee43..a2f439fcd 100644 --- a/tf-managed/modules/sync-check/variables.tf +++ b/tf-managed/modules/sync-check/variables.tf @@ -64,7 +64,7 @@ variable "NEW_RELIC_ACCOUNT_ID" { } variable "common_resources_dir" { - type = string + type = string } variable "environment" { From fd1273d7084018bfd1207169cd232612defbcfb5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 11 Jan 2024 16:44:36 +0000 Subject: [PATCH 21/46] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tf-managed/modules/daily-snapshot/monitoring/main.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tf-managed/modules/daily-snapshot/monitoring/main.tf b/tf-managed/modules/daily-snapshot/monitoring/main.tf index a96e46e4f..88ff815c6 100644 --- a/tf-managed/modules/daily-snapshot/monitoring/main.tf +++ b/tf-managed/modules/daily-snapshot/monitoring/main.tf @@ -12,11 +12,11 @@ locals { # name = "High Disk Utilization" # description = "Alert when disk space usage is high on an the service host" # enabled = true -# +# # nrql { # query = "SELECT latest(diskUsedPercent) FROM StorageSample where entityName = '${var.service_name}'" # } -# +# # critical { # operator = "above" # # threshold = 85.0 @@ -24,7 +24,7 @@ locals { # threshold_duration = 300 # threshold_occurrences = "ALL" # } -# +# # warning { # operator = "above" # # threshold = 70.0 From 4c78c9145c5a762734f6164fd8738a185bd54453 Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Thu, 11 Jan 2024 17:45:28 +0100 Subject: [PATCH 22/46] js lint --- .../snapshot-age-monitor.js | 31 +++++++++++++------ 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/tf-managed/modules/snapshot-monitoring/snapshot-age-monitor.js b/tf-managed/modules/snapshot-monitoring/snapshot-age-monitor.js index 7869b758d..3efbaaf35 100644 --- a/tf-managed/modules/snapshot-monitoring/snapshot-age-monitor.js +++ b/tf-managed/modules/snapshot-monitoring/snapshot-age-monitor.js @@ -1,21 +1,34 @@ -var assert = require('assert'); +// eslint-disable-next-line @typescript-eslint/no-var-requires -- "approved" methods of resolving this lint do not work in the NR context. +var assert = require("assert"); function check_snapshot(url, genesisTime) { - var callback = function (err, response, body) { - assert.equal(response.statusCode, 200, 'Expected a 200 OK response'); + // eslint-disable-next-line @typescript-eslint/no-unused-vars -- that's how the callback works in this context. 
+ var callback = function (_err, response, _body) { + assert.equal(response.statusCode, 200, "Expected a 200 OK response"); - var snapshotName = response.url.split('/').pop(); + var snapshotName = response.url.split("/").pop(); var height = snapshotName.match(/height_(\d+)/)[1]; var currentTime = Math.floor(Date.now() / 1000); var snapshotTime = height * 30 + genesisTime; var snapshotAgeInMinutes = (currentTime - snapshotTime) / 60; - assert(snapshotAgeInMinutes < 360, 'Expected snapshot to be less than 360 minutes old'); - } + assert( + snapshotAgeInMinutes < 360, + "Expected snapshot to be less than 360 minutes old" + ); + }; - $http.head(url, callback) + // This variable is provided by New Relic. + // eslint-disable-next-line no-undef + $http.head(url, callback); } -check_snapshot('https://forest-archive.chainsafe.dev/latest/calibnet/', 1667326380) -check_snapshot('https://forest-archive.chainsafe.dev/latest/mainnet/', 1598306400) +check_snapshot( + "https://forest-archive.chainsafe.dev/latest/calibnet/", + 1667326380 +); +check_snapshot( + "https://forest-archive.chainsafe.dev/latest/mainnet/", + 1598306400 +); From 6ab347655c63f1ae5ea627fdb7eed0e61a110840 Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Mon, 15 Jan 2024 15:47:26 +0100 Subject: [PATCH 23/46] add `live/` docs --- tf-managed/live/.gitignore | 3 ++ tf-managed/live/Makefile | 7 ++++ tf-managed/live/README.md | 84 +++++++++++++++++++++++++++++++++++++- 3 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 tf-managed/live/Makefile diff --git a/tf-managed/live/.gitignore b/tf-managed/live/.gitignore index 466339266..4f2d3e9f5 100644 --- a/tf-managed/live/.gitignore +++ b/tf-managed/live/.gitignore @@ -13,3 +13,6 @@ vendor .ruby-version .terraform.lock.hcl .DS_Store + +# Personal development environments +environments/dev-* diff --git a/tf-managed/live/Makefile b/tf-managed/live/Makefile new file mode 100644 index 000000000..bd34788f0 --- /dev/null +++ b/tf-managed/live/Makefile @@ -0,0 +1,7 @@ +# Creates a new environment for development from the base one. +environment: + $(eval export ENVIRONMENT=dev-$(shell cat /dev/urandom | tr -dc 'a-z0-9' | fold -w 8 | head -n 1)) + @cp -r environments/dev environments/$(ENVIRONMENT) + @echo "Environment: $(ENVIRONMENT). Happy hacking!" + +.PHONY: environment diff --git a/tf-managed/live/README.md b/tf-managed/live/README.md index 8f0005319..34137f8a1 100644 --- a/tf-managed/live/README.md +++ b/tf-managed/live/README.md @@ -1 +1,83 @@ -All Terragrunt configurations live here. To edit Terraform files, go to ../modules +All Terragrunt configurations live here. To edit Terraform files, go to `../modules`. + +# Summary + +The Terragrunt configurations manage the actual environments and, in principle, should reflect the current state of the given environment. + +# Development +As a developer, you should create your own environment, separated from the others. In this directory, execute `make environment` and one will be created for you. Do not work on the `dev` environment directly as others may be working on it as well. + +``` +❯ make environment +Environment: dev-7zryf85r. Happy hacking! +``` + +After ensuring the changes work correctly, merge the changes from your development environment to the base one and, possibly, `prod`. + + +# Conventions + +## Environments + +There is currently no notion of `staging` environment, though one may be introduced in the future. + +``` +. +├── dev # Development environment template for custom environments. 
+├── dev- # Personal development environment
+└── prod # Production environment. Should reflect reality.
+```
+
+The `prod` environment should be deployed only by the GH worker and not manually.
+
+Each environment contains its respective `applications/`. A `base-infrastructure` may be created in the future to denote resources shared between applications. Each application should contain a single `terragrunt.hcl` file which only sets its configuration and, optionally, defines dependencies. The application code itself should be defined in `../modules`.
+
+
+```
+└── applications
+    ├── snapshot-monitoring
+    │   └── terragrunt.hcl
+    ├── snapshot-service
+    │   └── terragrunt.hcl
+    └── sync-check
+        └── terragrunt.hcl
+```
+
+The difference between a `prod` and a `dev` application should be minimal. This would include a different Slack notification channel (which is already handled by the root `terragrunt.hcl`) or using larger instances for the `prod` environment.
+
+## Tags
+
+Wherever applicable, the resources should include the following tags:
+- `iac` - indicates the resource is governed by Terraform and should not be mutated outside of the infrastructure code,
+- `` - indicates the environment name.
+
+# Secrets
+
+There are several secrets that need to be defined and provided for the services to work. You can find them in the team's password manager. Each service defines its own set of required variables, though all need access to DigitalOcean. See the modules' documentation for more details.
+
+```
+#################################
+### Required for all services ###
+#################################
+# DigitalOcean personal access token: https://cloud.digitalocean.com/account/api/tokens
+export TF_VAR_digitalocean_token=
+
+# S3 access keys used by Terraform for the remote state.
+export AWS_ACCESS_KEY_ID=
+export AWS_SECRET_ACCESS_KEY=
+
+#################################
+####### Service-specific ########
+#################################
+
+# Required for services with Slack notifications
+export TF_VAR_slack_token=
+
+# Required for access to Cloudflare R2
+export TF_VAR_R2_ACCESS_KEY=
+export TF_VAR_R2_SECRET_KEY=
+
+# Required if NewRelic monitoring/alerting is enabled.
+export TF_VAR_new_relic_api_key=
+export TF_VAR_new_relic_account_id=
+```
From dddf4c8a0d7d6c0e80275129385bd737d8033311 Mon Sep 17 00:00:00 2001
From: Hubert Bugaj 
Date: Mon, 15 Jan 2024 16:28:36 +0100
Subject: [PATCH 24/46] more docs

---
 tf-managed/live/README.md | 14 +++++++++++++-
 tf-managed/live/terragrunt.hcl | 10 ++++++++--
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/tf-managed/live/README.md b/tf-managed/live/README.md
index 34137f8a1..33c2556ee 100644
--- a/tf-managed/live/README.md
+++ b/tf-managed/live/README.md
@@ -1,7 +1,6 @@
 All Terragrunt configurations live here. To edit Terraform files, go to `../modules`.

 # Summary
-
 The Terragrunt configurations manage the actual environments and, in principle, should reflect the current state of the given environment.

 # Development
 As a developer, you should create your own environment, separated from the others. In this directory, execute `make environment` and one will be created for you. Do not work on the `dev` environment directly as others may be working on it as well.

 ```
 ❯ make environment
 Environment: dev-7zryf85r. Happy hacking!
 ```

+Inside the specific application in the environment, run:
+```
+❯ terragrunt plan
+```
+
+This should show you the resources to be changed/created/destroyed.
+```
+❯ terragrunt apply
+```
+
 After ensuring the changes work correctly, merge the changes from your development environment to the base one and, possibly, `prod`.

+Remember to clean up your environment. 
Use `terragrunt destroy`. + # Conventions @@ -81,3 +92,4 @@ export TF_VAR_R2_SECRET_KEY= export TF_VAR_new_relic_api_key= export TF_VAR_new_relic_account_id= ``` + diff --git a/tf-managed/live/terragrunt.hcl b/tf-managed/live/terragrunt.hcl index 96655be36..45c337123 100644 --- a/tf-managed/live/terragrunt.hcl +++ b/tf-managed/live/terragrunt.hcl @@ -1,3 +1,6 @@ +# This is the root terragrunt file. It is used to define the remote state +# and the common inputs for all the services. + locals { # Parse the file path we're in to read the env name: e.g., env # will be "dev" in the dev folder, "stage" in the stage folder, @@ -16,8 +19,8 @@ remote_state { config = { // if the environment is dev, use the dev bucket, otherwise use the prod bucket bucket = (local.env == "prod" - ? "hubert-bucket-prod" - : "hubert-bucket-dev" + ? "forest-iac-bucket-prod" + : "forest-iac-bucket-dev" ) key = "${path_relative_to_include()}/terraform.tfstate" region = "eu-west-1" @@ -37,6 +40,9 @@ remote_state { # Common inputs for all the services. inputs = { + # The common resources dir contains common code that we want to share across all services. + # This is a legacy from the previous version of the infrastructure, and this will be removed + # in the future. common_resources_dir = format("%s/../common", get_parent_terragrunt_dir()) slack_channel = (local.env == "prod" ? "#forest-notifications" : "#forest-dump") environment = local.env From ba3d63a70f6225ad6f66e9c848ce4db9644f4af5 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 15 Jan 2024 15:29:10 +0000 Subject: [PATCH 25/46] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tf-managed/live/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/tf-managed/live/README.md b/tf-managed/live/README.md index 33c2556ee..7762502c8 100644 --- a/tf-managed/live/README.md +++ b/tf-managed/live/README.md @@ -92,4 +92,3 @@ export TF_VAR_R2_SECRET_KEY= export TF_VAR_new_relic_api_key= export TF_VAR_new_relic_account_id= ``` - From 11951009a5e681c0eab1b9bb6916c1bdfcfd3412 Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Mon, 15 Jan 2024 17:57:29 +0100 Subject: [PATCH 26/46] sync to main --- tf-managed/common/ruby_common/slack_client.rb | 2 +- .../daily-snapshot/service/upload_snapshot.sh | 11 ++++-- .../modules/sync-check/service/sync_check.rb | 39 ++++++++++--------- .../sync-check/service/sync_check_process.rb | 4 +- 4 files changed, 32 insertions(+), 24 deletions(-) diff --git a/tf-managed/common/ruby_common/slack_client.rb b/tf-managed/common/ruby_common/slack_client.rb index 0165d1b2e..a237f03d9 100644 --- a/tf-managed/common/ruby_common/slack_client.rb +++ b/tf-managed/common/ruby_common/slack_client.rb @@ -42,7 +42,7 @@ def attach_files(*files) # Attaches a file to the latest posted thread. def attach_file(file) - raise "No such file #{file}" unless File.exist? file + return unless File.exist? file raise 'Need to create a thread before attaching a file.' if @last_thread.nil? @client.files_upload( diff --git a/tf-managed/modules/daily-snapshot/service/upload_snapshot.sh b/tf-managed/modules/daily-snapshot/service/upload_snapshot.sh index 7cbd18dff..96429d002 100755 --- a/tf-managed/modules/daily-snapshot/service/upload_snapshot.sh +++ b/tf-managed/modules/daily-snapshot/service/upload_snapshot.sh @@ -22,7 +22,7 @@ set -eux # Install utility binaries that do not come with the image. # This assumes the container was started as a superuser. 
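# Context for the dependency change below: aria2 is a downloader that fetches a
# file over several connections in parallel (the -x5 flag used further down in
# this script allows up to five connections per server), which is what makes it
# a workable substitute for Forest's built-in snapshot download here.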
-apt-get update && apt-get install -y curl +apt-get update && apt-get install -y curl aria2 # Switch back to the service user for other service commands. su - forest @@ -64,8 +64,13 @@ echo "Chain: $CHAIN_NAME" forest-tool db destroy --force --config config.toml --chain "$CHAIN_NAME" -forest --config config.toml --chain "$CHAIN_NAME" --auto-download-snapshot --halt-after-import -forest --config config.toml --chain "$CHAIN_NAME" --no-gc --save-token=token.txt --detach +# Workaround for https://github.com/ChainSafe/forest/issues/3715 +# Normally, Forest should automatically download the latest snapshot. However, the performance +# of the download gets randomly bad, and the download times out. +aria2c -x5 https://forest-archive.chainsafe.dev/latest/$CHAIN_NAME/ +forest --config config.toml --chain "$CHAIN_NAME" --consume-snapshot *.car.zst --halt-after-import + +forest --config config.toml --chain "$CHAIN_NAME" --no-gc --save-token=token.txt --target-peer-count 500 --detach timeout "$SYNC_TIMEOUT" forest-cli sync wait forest-cli snapshot export -o forest_db/ forest-cli --token=\$(cat token.txt) shutdown --force diff --git a/tf-managed/modules/sync-check/service/sync_check.rb b/tf-managed/modules/sync-check/service/sync_check.rb index eb24ab777..3e939bde4 100755 --- a/tf-managed/modules/sync-check/service/sync_check.rb +++ b/tf-managed/modules/sync-check/service/sync_check.rb @@ -19,6 +19,7 @@ def get_and_assert_env_variable(name) CHANNEL = get_and_assert_env_variable 'FOREST_SLACK_NOTIF_CHANNEL' SCRIPTS_DIR = get_and_assert_env_variable 'SCRIPTS_DIR' LOG_DIR = get_and_assert_env_variable 'LOG_DIR' +TARGET_DATA = get_and_assert_env_variable 'FOREST_TARGET_DATA' hostname = ARGV[0] raise 'No arguments supplied. Please provide Forest hostname, e.g. forest-mainnet' if ARGV.empty? @@ -36,24 +37,26 @@ def get_and_assert_env_variable(name) logger = Logger.new(LOG_SYNC) -# Run the actual health check -logger.info 'Running the health check...' -health_check_passed = system("bash #{SCRIPTS_DIR}/health_check.sh #{hostname} > #{LOG_HEALTH} 2>&1") -logger.info 'Health check finished' - -# Save the log capture from the Forest container -container_logs = DockerUtils.get_container_logs hostname -File.write(LOG_FOREST, container_logs) - -client = SlackClient.new CHANNEL, SLACK_TOKEN - -if health_check_passed - client.post_message "✅ Sync check for #{hostname} passed. 🌲🌳🌲🌳🌲" -else - client.post_message "⛔ Sync check for #{hostname} fiascoed. 🔥🌲🔥" - SyncCheck.new.run_forest_tool("db destroy --chain #{network} --force") - logger.info 'DB Destroyed' +begin + # Run the actual health check + logger.info 'Running the health check...' + health_check_passed = system("bash #{SCRIPTS_DIR}/health_check.sh #{hostname} > #{LOG_HEALTH} 2>&1") + logger.info 'Health check finished' + + # Save the log capture from the Forest container + container_logs = DockerUtils.get_container_logs hostname + File.write(LOG_FOREST, container_logs) +ensure + client = SlackClient.new CHANNEL, SLACK_TOKEN + + if health_check_passed + client.post_message "✅ Sync check for #{hostname} passed. 🌲🌳🌲🌳🌲" + else + client.post_message "⛔ Sync check for #{hostname} fiascoed. 
🔥🌲🔥" + FileUtils.rm_rf("#{TARGET_DATA}/#{network}") + logger.info 'DB Destroyed' + end + client.attach_files(LOG_HEALTH, LOG_SYNC, LOG_FOREST) end -client.attach_files(LOG_HEALTH, LOG_SYNC, LOG_FOREST) logger.info 'Sync check finished' diff --git a/tf-managed/modules/sync-check/service/sync_check_process.rb b/tf-managed/modules/sync-check/service/sync_check_process.rb index 48cfc25ac..9751d7c33 100755 --- a/tf-managed/modules/sync-check/service/sync_check_process.rb +++ b/tf-managed/modules/sync-check/service/sync_check_process.rb @@ -17,9 +17,9 @@ # Sync check class encompassing all required methods and fields class SyncCheck - def initialize + def initialize(slack_client = nil) @logger = Logger.new($stdout) - @client = SlackClient.new CHANNEL, SLACK_TOKEN + @client = slack_client || SlackClient.new(CHANNEL, SLACK_TOKEN) end # Runs a command with an arbitrary binary available in the chainsafe/forest image From 75e3f6ef8b505a902b4e7d6a234c6b303340fefe Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Mon, 15 Jan 2024 18:00:31 +0100 Subject: [PATCH 27/46] remove daily-snapshot, sync-check --- terraform/daily_snapshot/.terraform.lock.hcl | 69 ------- terraform/daily_snapshot/README.md | 59 ------ terraform/daily_snapshot/main.tf | 54 ------ terraform/daily_snapshot/prod/main.tf | 54 ------ terraform/daily_snapshot/prod/variable.tf | 35 ---- terraform/daily_snapshot/variable.tf | 23 --- terraform/modules/daily_snapshot/main.tf | 176 ------------------ .../modules/daily_snapshot/prep_sources.sh | 11 -- .../daily_snapshot/service/calibnet_cron_job | 7 - .../daily_snapshot/service/daily_snapshot.rb | 68 ------- .../daily_snapshot/service/forest-env.tpl | 11 -- .../modules/daily_snapshot/service/init.sh | 41 ---- .../daily_snapshot/service/mainnet_cron_job | 7 - .../service/newrelic_fail2ban.sh | 55 ------ .../service/upload_filops_snapshot.sh | 65 ------- .../daily_snapshot/service/upload_snapshot.sh | 119 ------------ terraform/modules/daily_snapshot/variable.tf | 111 ----------- terraform/modules/sync_check/main.tf | 131 ------------- terraform/modules/sync_check/prep_sources.sh | 14 -- .../sync_check/service/Dockerfile-tester | 13 -- terraform/modules/sync_check/service/Gemfile | 7 - .../modules/sync_check/service/Gemfile.lock | 41 ---- .../sync_check/service/docker-compose.yml | 118 ------------ .../modules/sync_check/service/forest-env.tpl | 9 - .../sync_check/service/health_check.sh | 112 ----------- terraform/modules/sync_check/service/init.sh | 66 ------- .../sync_check/service/restart.service | 9 - .../modules/sync_check/service/restart.sh | 7 - .../modules/sync_check/service/run_service.sh | 49 ----- .../modules/sync_check/service/sync_check.rb | 62 ------ .../sync_check/service/sync_check.toml | 4 - .../sync_check/service/sync_check_process.rb | 123 ------------ terraform/modules/sync_check/variable.tf | 64 ------- terraform/sync_check/.terraform.lock.hcl | 64 ------- terraform/sync_check/README.md | 86 --------- terraform/sync_check/main.tf | 49 ----- terraform/sync_check/variable.tf | 23 --- 37 files changed, 2016 deletions(-) delete mode 100644 terraform/daily_snapshot/.terraform.lock.hcl delete mode 100644 terraform/daily_snapshot/README.md delete mode 100644 terraform/daily_snapshot/main.tf delete mode 100644 terraform/daily_snapshot/prod/main.tf delete mode 100644 terraform/daily_snapshot/prod/variable.tf delete mode 100644 terraform/daily_snapshot/variable.tf delete mode 100644 terraform/modules/daily_snapshot/main.tf delete mode 100755 
terraform/modules/daily_snapshot/prep_sources.sh delete mode 100755 terraform/modules/daily_snapshot/service/calibnet_cron_job delete mode 100644 terraform/modules/daily_snapshot/service/daily_snapshot.rb delete mode 100644 terraform/modules/daily_snapshot/service/forest-env.tpl delete mode 100755 terraform/modules/daily_snapshot/service/init.sh delete mode 100755 terraform/modules/daily_snapshot/service/mainnet_cron_job delete mode 100644 terraform/modules/daily_snapshot/service/newrelic_fail2ban.sh delete mode 100644 terraform/modules/daily_snapshot/service/upload_filops_snapshot.sh delete mode 100755 terraform/modules/daily_snapshot/service/upload_snapshot.sh delete mode 100644 terraform/modules/daily_snapshot/variable.tf delete mode 100644 terraform/modules/sync_check/main.tf delete mode 100755 terraform/modules/sync_check/prep_sources.sh delete mode 100644 terraform/modules/sync_check/service/Dockerfile-tester delete mode 100644 terraform/modules/sync_check/service/Gemfile delete mode 100644 terraform/modules/sync_check/service/Gemfile.lock delete mode 100644 terraform/modules/sync_check/service/docker-compose.yml delete mode 100644 terraform/modules/sync_check/service/forest-env.tpl delete mode 100755 terraform/modules/sync_check/service/health_check.sh delete mode 100755 terraform/modules/sync_check/service/init.sh delete mode 100644 terraform/modules/sync_check/service/restart.service delete mode 100755 terraform/modules/sync_check/service/restart.sh delete mode 100755 terraform/modules/sync_check/service/run_service.sh delete mode 100755 terraform/modules/sync_check/service/sync_check.rb delete mode 100644 terraform/modules/sync_check/service/sync_check.toml delete mode 100755 terraform/modules/sync_check/service/sync_check_process.rb delete mode 100644 terraform/modules/sync_check/variable.tf delete mode 100644 terraform/sync_check/.terraform.lock.hcl delete mode 100644 terraform/sync_check/README.md delete mode 100644 terraform/sync_check/main.tf delete mode 100644 terraform/sync_check/variable.tf diff --git a/terraform/daily_snapshot/.terraform.lock.hcl b/terraform/daily_snapshot/.terraform.lock.hcl deleted file mode 100644 index ae80807ad..000000000 --- a/terraform/daily_snapshot/.terraform.lock.hcl +++ /dev/null @@ -1,69 +0,0 @@ -# This file is maintained automatically by "terraform init". -# Manual edits may be lost in future updates. 
- -provider "registry.terraform.io/digitalocean/digitalocean" { - version = "2.29.0" - constraints = "~> 2.0" - hashes = [ - "h1:KSmD5RdWr/Go4Q5GlY9QsfSm1vtKxBJjJe3M5gaQXjg=", - "h1:OLSxMaqLOUl6DjQ3vz14odCyMCcLA63ltBNPgrIQHG4=", - "zh:0af0a1a2de818c5dc8ee7ad4dc4731452848e84cfa0c1ce514af1c7aad15c53c", - "zh:27229f3162b4142be48554f56227265982f3b74e4c79fa5d2528c8a3912d1e19", - "zh:31d6e73bfe12231fa0ab3bbeef0e4aa9822a2008ae2a1a8b22557bdada4af7a3", - "zh:6e7417413e96b87a11d47e9acbc88e6d707a6ab23a7de6b584fc600d9d3cbf00", - "zh:9faf40798a698b80e8d56e502c220856d2d5f55d5137b9cf5371f2fdaeadd70a", - "zh:b9ab9caf21b3f928fdd891e749fd8d33f6d441b39a08d725edf58cf8027a9b7b", - "zh:be32b3a35474f8acbab4d0ad8676810fa05a87918cc1874b53672159005016c0", - "zh:c2e8f7c08cad44b46e2e5580183e1ef2a4f1803347de136d1a35f333973a25f0", - "zh:cf0aba5b5042c762da489050716815652f809f3ef0ededb0f981f11691dbef03", - "zh:d1c0874c0ae0aa1eae86dbd131978796303599709c35b5dee926887d375f4cc8", - "zh:d4eecb61e763950a5a0f40cddc7a58345419a522b783aae7b0703309a354bb0c", - "zh:d866df86dd78eb2a9e54ebff637301522766710bb6dc7f8e330f1146822b62ee", - "zh:da51541ef96d0a5745740dc623bff3ccfb6b098b548d78cf5e9d95a15c69963a", - "zh:ede343be1528b468feae3a1cbf781e223f63ce33446a008a42f2fb799a23b436", - "zh:f20a60e2cecd29bbcc73d59e95aca368e2c55b7648f1923df2c0f7578026b048", - "zh:fccaf963f2db1e271e9d28276172910ca6b95471b8e0dfac758daf0495ce17f5", - ] -} - -provider "registry.terraform.io/hashicorp/external" { - version = "2.3.1" - constraints = "~> 2.1" - hashes = [ - "h1:9rJggijNdRdFk//ViQPGZdK0xu9XU/9qBDijNsZJMg0=", - "h1:bROCw6g5D/3fFnWeJ01L4IrdnJl1ILU8DGDgXCtYzaY=", - "zh:001e2886dc81fc98cf17cf34c0d53cb2dae1e869464792576e11b0f34ee92f54", - "zh:2eeac58dd75b1abdf91945ac4284c9ccb2bfb17fa9bdb5f5d408148ff553b3ee", - "zh:2fc39079ba61411a737df2908942e6970cb67ed2f4fb19090cd44ce2082903dd", - "zh:472a71c624952cff7aa98a7b967f6c7bb53153dbd2b8f356ceb286e6743bb4e2", - "zh:4cff06d31272aac8bc35e9b7faec42cf4554cbcbae1092eaab6ab7f643c215d9", - "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", - "zh:7ed16ccd2049fa089616b98c0bd57219f407958f318f3c697843e2397ddf70df", - "zh:842696362c92bf2645eb85c739410fd51376be6c488733efae44f4ce688da50e", - "zh:8985129f2eccfd7f1841ce06f3bf2bbede6352ec9e9f926fbaa6b1a05313b326", - "zh:a5f0602d8ec991a5411ef42f872aa90f6347e93886ce67905c53cfea37278e05", - "zh:bf4ab82cbe5256dcef16949973bf6aa1a98c2c73a98d6a44ee7bc40809d002b8", - "zh:e70770be62aa70198fa899526d671643ff99eecf265bf1a50e798fc3480bd417", - ] -} - -provider "registry.terraform.io/hashicorp/local" { - version = "2.4.0" - constraints = "~> 2.1" - hashes = [ - "h1:Bs7LAkV/iQTLv72j+cTMrvx2U3KyXrcVHaGbdns1NcE=", - "h1:R97FTYETo88sT2VHfMgkPU3lzCsZLunPftjSI5vfKe8=", - "zh:53604cd29cb92538668fe09565c739358dc53ca56f9f11312b9d7de81e48fab9", - "zh:66a46e9c508716a1c98efbf793092f03d50049fa4a83cd6b2251e9a06aca2acf", - "zh:70a6f6a852dd83768d0778ce9817d81d4b3f073fab8fa570bff92dcb0824f732", - "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", - "zh:82a803f2f484c8b766e2e9c32343e9c89b91997b9f8d2697f9f3837f62926b35", - "zh:9708a4e40d6cc4b8afd1352e5186e6e1502f6ae599867c120967aebe9d90ed04", - "zh:973f65ce0d67c585f4ec250c1e634c9b22d9c4288b484ee2a871d7fa1e317406", - "zh:c8fa0f98f9316e4cfef082aa9b785ba16e36ff754d6aba8b456dab9500e671c6", - "zh:cfa5342a5f5188b20db246c73ac823918c189468e1382cb3c48a9c0c08fc5bf7", - "zh:e0e2b477c7e899c63b06b38cd8684a893d834d6d0b5e9b033cedc06dd7ffe9e2", - "zh:f62d7d05ea1ee566f732505200ab38d94315a4add27947a60afa29860822d3fc", - 
"zh:fa7ce69dde358e172bd719014ad637634bbdabc49363104f4fca759b4b73f2ce", - ] -} diff --git a/terraform/daily_snapshot/README.md b/terraform/daily_snapshot/README.md deleted file mode 100644 index 15dafcf89..000000000 --- a/terraform/daily_snapshot/README.md +++ /dev/null @@ -1,59 +0,0 @@ -# Overview - -This directory contains an infrastructure configuration for the Forest service, -which generates the latest snapshots available at this endpoint: -https://forest-archive.chainsafe.dev/latest/mainnet/ and -https://forest-archive.chainsafe.dev/latest/calibnet/ - -# Workflow - -Changing any of the settings (such as the size of the droplet or the operating -system) will automatically re-deploy the service. The same is true for changing -any of the scripts. - -To propose new changes, start by opening a PR. This will trigger a new -deployment plan to be pasted in the PR comments. Once the PR is merged, the -deployment plan is executed. - -The workflow has access to all the required secrets (DO token, slack token, S3 -credentials, etc) and none of them have to be provided when creating a new PR. -However, the deployment workflow is not triggered automatically if you change -the secrets. In this case, you have to trigger the workflow manually. - -# Manual deployments - -To manually deploy the service (useful for testing and debugging), you first -need to set the following environment variables (you will be prompted later if -you don't set these variables): - -## Required environment variables - -```bash -# DigitalOcean personal access token: https://cloud.digitalocean.com/account/api/tokens -export TF_VAR_do_token= -# Slack access token: https://api.slack.com/apps -export TF_VAR_slack_token= -# S3 access keys used by terraform. Can be generated here: https://cloud.digitalocean.com/account/api/spaces -export AWS_ACCESS_KEY_ID= -export AWS_SECRET_ACCESS_KEY= - -# Cloudflare R2 secret access keys used by the snapshot service. -export TF_VAR_R2_ACCESS_KEY= -export TF_VAR_R2_SECRET_KEY= -``` - -Forest tokens can be found on 1password. - -Playbook: - -```bash -$ terraform init # Initialize terraform state -$ terraform plan # Show deployment plan (optional) -$ terraform apply # Apply deployment plan -$ terraform destroy # Destroy deployment -``` - -For Mac users, if you encounter the `Error: External Program Execution Failed`, you'll need to adjust the `prep_sources.sh` file located in the `../modules/daily_snapshot` directory. Make the following changes: - -- Replace `--archive` with `-Rp`. -- Install `gnu-tar` using the command `brew install gnu-tar`. Afterward, switch `tar cf ../sources.tar` to `gtar cf ../sources.tar` diff --git a/terraform/daily_snapshot/main.tf b/terraform/daily_snapshot/main.tf deleted file mode 100644 index 2950c72ca..000000000 --- a/terraform/daily_snapshot/main.tf +++ /dev/null @@ -1,54 +0,0 @@ -terraform { - required_version = "~> 1.3" - - backend "s3" { - # Note: This is the bucket for the internal terraform state. This bucket is - # completely independent from the bucket that contains snapshots. - bucket = "forest-iac" - # This key uniquely identifies the service. To create a new service (instead - # of modifying this one), use a new key. Unfortunately, variables may not be - # used here. - key = "daily_snapshot_dev.tfstate" - - # This value is completely unused by DO but _must_ be a known AWS region. - region = "us-west-1" - # The S3 region is determined by the endpoint. fra1 = Frankfurt. - # This region does not have to be shared by the droplet. 
- endpoints = { - s3 = "https://fra1.digitaloceanspaces.com" - } - # Credentially can be validated through the Security Token Service (STS). - # Unfortunately, DigitalOcean does not support STS so we have to skip the - # validation. - skip_credentials_validation = "true" - skip_requesting_account_id = "true" - skip_s3_checksum = "true" - } -} - -module "daily_snapshot" { - # Import the daily_snapshot module - source = "../modules/daily_snapshot" - - # Configure service: - name = "forest-snapshot-dev" # droplet name - size = "c2-8vcpu-16gb" # droplet size - slack_channel = "#forest-dump" # slack channel for notifications - snapshot_bucket = "forest-archive-dev" - snapshot_endpoint = "fra1.digitaloceanspaces.com" - forest_tag = "latest" - r2_endpoint = "https://2238a825c5aca59233eab1f221f7aefb.r2.cloudflarestorage.com/" - - # Variable passthrough: - slack_token = var.slack_token - R2_ACCESS_KEY = var.R2_ACCESS_KEY - R2_SECRET_KEY = var.R2_SECRET_KEY - digitalocean_token = var.do_token - NEW_RELIC_API_KEY = "" - NEW_RELIC_ACCOUNT_ID = "" -} - -# This ip address may be used in the future by monitoring software -output "ip" { - value = [module.daily_snapshot.ip] -} diff --git a/terraform/daily_snapshot/prod/main.tf b/terraform/daily_snapshot/prod/main.tf deleted file mode 100644 index e5bd52849..000000000 --- a/terraform/daily_snapshot/prod/main.tf +++ /dev/null @@ -1,54 +0,0 @@ -terraform { - required_version = "~> 1.3" - - backend "s3" { - # Note: This is the bucket for the internal terraform state. This bucket is - # completely independent from the bucket that contains snapshots. - bucket = "forest-iac" - # This key uniquely identifies the service. To create a new service (instead - # of modifying this one), use a new key. Unfortunately, variables may not be - # used here. - key = "daily_snapshot.tfstate" - - # This value is completely unused by DO but _must_ be a known AWS region. - region = "us-west-1" - # The S3 region is determined by the endpoint. fra1 = Frankfurt. - # This region does not have to be shared by the droplet. - endpoints = { - s3 = "https://fra1.digitaloceanspaces.com" - } - - # Credentially can be validated through the Security Token Service (STS). - # Unfortunately, DigitalOcean does not support STS so we have to skip the - # validation. - skip_credentials_validation = "true" - skip_requesting_account_id = "true" - skip_s3_checksum = "true" - } -} - -module "daily_snapshot" { - # Import the daily_snapshot module - source = "../../modules/daily_snapshot" - - # Configure service: - name = "forest-snapshot" # droplet name - size = "s-4vcpu-16gb-amd" # droplet size - slack_channel = "#forest-notifications" # slack channel for notifications - snapshot_bucket = "forest-archive" - forest_tag = "v0.16.4" - r2_endpoint = "https://2238a825c5aca59233eab1f221f7aefb.r2.cloudflarestorage.com/" - - # Variable passthrough: - slack_token = var.slack_token - R2_ACCESS_KEY = var.R2_ACCESS_KEY - R2_SECRET_KEY = var.R2_SECRET_KEY - digitalocean_token = var.do_token - NEW_RELIC_API_KEY = var.NEW_RELIC_API_KEY - NEW_RELIC_ACCOUNT_ID = var.NEW_RELIC_ACCOUNT_ID -} - -# This ip address may be used in the future by monitoring software -output "ip" { - value = [module.daily_snapshot.ip] -} diff --git a/terraform/daily_snapshot/prod/variable.tf b/terraform/daily_snapshot/prod/variable.tf deleted file mode 100644 index dceec8b88..000000000 --- a/terraform/daily_snapshot/prod/variable.tf +++ /dev/null @@ -1,35 +0,0 @@ -variable "do_token" { - description = "Token for authentication." 
- type = string - sensitive = true -} - -variable "R2_ACCESS_KEY" { - description = "S3 access key id" - type = string - sensitive = true -} - -variable "R2_SECRET_KEY" { - description = "S3 private access key" - type = string - sensitive = true -} - -variable "slack_token" { - description = "slack access token" - type = string - sensitive = true -} - -variable "NEW_RELIC_API_KEY" { - description = "New Relic API KEY" - type = string - sensitive = true -} - -variable "NEW_RELIC_ACCOUNT_ID" { - description = "The New Relic Account ID" - type = string - sensitive = true -} diff --git a/terraform/daily_snapshot/variable.tf b/terraform/daily_snapshot/variable.tf deleted file mode 100644 index c0f3ddce3..000000000 --- a/terraform/daily_snapshot/variable.tf +++ /dev/null @@ -1,23 +0,0 @@ -variable "do_token" { - description = "Token for authentication." - type = string - sensitive = true -} - -variable "R2_ACCESS_KEY" { - description = "S3 access key id" - type = string - sensitive = true -} - -variable "R2_SECRET_KEY" { - description = "S3 private access key" - type = string - sensitive = true -} - -variable "slack_token" { - description = "slack access token" - type = string - sensitive = true -} diff --git a/terraform/modules/daily_snapshot/main.tf b/terraform/modules/daily_snapshot/main.tf deleted file mode 100644 index 5896116d4..000000000 --- a/terraform/modules/daily_snapshot/main.tf +++ /dev/null @@ -1,176 +0,0 @@ -# This terraform script executes the following steps: -# - Zip the ruby and shell script files (the hash of this zip file is used to -# determine when to re-deploy the service) -# - Boot a new droplet -# - Copy over the zip file -# - Run the init.sh script in the background - -terraform { - required_version = "~> 1.3" - - required_providers { - digitalocean = { - source = "digitalocean/digitalocean" - version = "~> 2.0" - } - external = { - source = "hashicorp/external" - version = "~> 2.1" - } - local = { - source = "hashicorp/local" - version = "~> 2.1" - } - - } -} - -provider "digitalocean" { - token = var.digitalocean_token -} - -// Ugly hack because 'archive_file' cannot mix files and folders. -data "external" "sources_tar" { - program = ["sh", "${path.module}/prep_sources.sh", path.module] -} - - -data "local_file" "sources" { - filename = data.external.sources_tar.result.path -} - -// Note: The init.sh file is also included in the sources.zip such that the hash -// of the archive captures the entire state of the machine. -// This is a workaround, and because of this, we need to suppress the tflint warning here -// for unused declarations related to the 'init.sh' file. 
-// tflint-ignore: terraform_unused_declarations -data "local_file" "init" { - filename = "${path.module}/service/init.sh" -} - -data "digitalocean_ssh_keys" "keys" { - sort { - key = "name" - direction = "asc" - } -} - -# Set required environment variables -locals { - env_content = templatefile("${path.module}/service/forest-env.tpl", { - R2_ACCESS_KEY = var.R2_ACCESS_KEY, - R2_SECRET_KEY = var.R2_SECRET_KEY, - r2_endpoint = var.r2_endpoint, - slack_token = var.slack_token, - slack_channel = var.slack_channel, - snapshot_bucket = var.snapshot_bucket, - snapshot_endpoint = var.snapshot_endpoint, - NEW_RELIC_API_KEY = var.NEW_RELIC_API_KEY, - NEW_RELIC_ACCOUNT_ID = var.NEW_RELIC_ACCOUNT_ID, - NEW_RELIC_REGION = var.NEW_RELIC_REGION, - BASE_FOLDER = "/root", - forest_tag = var.forest_tag - }) -} - -locals { - init_commands = ["cd /root/", - "tar xf sources.tar", - # Set required environment variables - "echo '${local.env_content}' >> /root/.forest_env", - "echo '. ~/.forest_env' >> .bashrc", - ". ~/.forest_env", - "nohup sh ./init.sh > init_log.txt &", - # Exiting without a sleep sometimes kills the script :-/ - "sleep 60s" - ] -} - -resource "digitalocean_droplet" "forest" { - image = var.image - name = var.name - region = var.region - size = var.size - # Re-initialize resource if this hash changes: - user_data = join("-", [data.local_file.sources.content_sha256, sha256(join("", local.init_commands))]) - tags = ["iac"] - ssh_keys = data.digitalocean_ssh_keys.keys.ssh_keys[*].fingerprint - monitoring = true - - graceful_shutdown = false - - connection { - host = self.ipv4_address - user = "root" - type = "ssh" - } - - # Push the sources.tar file to the newly booted droplet - provisioner "file" { - source = data.local_file.sources.filename - destination = "/root/sources.tar" - } - - provisioner "remote-exec" { - inline = local.init_commands - } -} - - -data "digitalocean_project" "forest_project" { - name = var.project -} - -# Connect the droplet to the forest project (otherwise it ends up in -# "ChainBridge" which is the default project) -resource "digitalocean_project_resources" "connect_forest_project" { - project = data.digitalocean_project.forest_project.id - resources = [digitalocean_droplet.forest.urn] -} - -resource "digitalocean_firewall" "forest-firewall" { - name = var.name - - inbound_rule { - protocol = "tcp" - port_range = "22" - source_addresses = var.source_addresses - } - - inbound_rule { - protocol = "tcp" - port_range = "2345" - source_addresses = var.source_addresses - } - - inbound_rule { - protocol = "tcp" - port_range = "80" - source_addresses = var.source_addresses - } - - inbound_rule { - protocol = "udp" - port_range = "53" - source_addresses = var.source_addresses - } - - outbound_rule { - protocol = "tcp" - port_range = "all" - destination_addresses = var.destination_addresses - } - - outbound_rule { - protocol = "udp" - port_range = "53" - destination_addresses = var.destination_addresses - } - - droplet_ids = [digitalocean_droplet.forest.id] -} - -# This ip address may be used in the future by monitoring software -output "ip" { - value = [digitalocean_droplet.forest.ipv4_address] -} diff --git a/terraform/modules/daily_snapshot/prep_sources.sh b/terraform/modules/daily_snapshot/prep_sources.sh deleted file mode 100755 index 075998cb7..000000000 --- a/terraform/modules/daily_snapshot/prep_sources.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -# Copy local source files in a folder together with ruby_common and create a zip archive. 
- -cd "$1" || exit -cp --archive ../../../scripts/ruby_common service/ || exit - -rm -f sources.tar -(cd service && tar cf ../sources.tar --sort=name --mtime='UTC 2019-01-01' ./* > /dev/null 2>&1) || exit -rm -fr service/ruby_common -echo "{ \"path\": \"$1/sources.tar\" }" diff --git a/terraform/modules/daily_snapshot/service/calibnet_cron_job b/terraform/modules/daily_snapshot/service/calibnet_cron_job deleted file mode 100755 index a492ad45d..000000000 --- a/terraform/modules/daily_snapshot/service/calibnet_cron_job +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -# shellcheck source=/dev/null -source ~/.forest_env -cd "$BASE_FOLDER" || exit -flock -n /tmp/calibnet.lock -c "ruby daily_snapshot.rb calibnet > logs/calibnet_log.txt 2>&1" -flock -n /tmp/calibnet_filops.lock -c "./upload_filops_snapshot.sh calibnet > logs/filops_calibnet_log.txt 2>&1" diff --git a/terraform/modules/daily_snapshot/service/daily_snapshot.rb b/terraform/modules/daily_snapshot/service/daily_snapshot.rb deleted file mode 100644 index 4bdc3d2d2..000000000 --- a/terraform/modules/daily_snapshot/service/daily_snapshot.rb +++ /dev/null @@ -1,68 +0,0 @@ -# frozen_string_literal: true - -require_relative 'ruby_common/slack_client' -require_relative 'ruby_common/docker_utils' -require_relative 'ruby_common/utils' - -require 'date' -require 'logger' -require 'fileutils' -require 'active_support/time' - -BASE_FOLDER = get_and_assert_env_variable 'BASE_FOLDER' -SLACK_TOKEN = get_and_assert_env_variable 'SLACK_API_TOKEN' -CHANNEL = get_and_assert_env_variable 'SLACK_NOTIF_CHANNEL' - -# Query the date of the most recent snapshot. -def latest_snapshot_date(chain_name = 'calibnet') - # We do not support HEAD requests but we _do_ support empty ranges. - filename = `curl --remote-name --remote-header-name --location --write-out "%{filename_effective}" --silent https://forest-archive.chainsafe.dev/latest/#{chain_name}/ -H "Range: bytes=0-0"` - # Curl will create a file with a single byte in it. Let's clean it up. - File.delete(filename) - snapshot_format = /^([^_]+?)_snapshot_(?[^_]+?)_(?\d{4}-\d{2}-\d{2})_height_(?\d+)(\.forest)?\.car.zst$/ - filename.match(snapshot_format) do |m| - m[:date].to_date - end -end - -CHAIN_NAME = ARGV[0] -raise 'No chain name supplied. Please provide chain identifier, e.g. calibnet or mainnet' if ARGV.empty? - -# Current datetime, to append to the log files -DATE = Time.new.strftime '%FT%H:%M:%S' -LOG_EXPORT_SCRIPT_RUN = "logs/#{CHAIN_NAME}_#{DATE}_script_run.txt" -LOG_EXPORT_DAEMON = "logs/#{CHAIN_NAME}_#{DATE}_daemon.txt" -LOG_EXPORT_METRICS = "logs/#{CHAIN_NAME}_#{DATE}_metrics.txt" - -client = SlackClient.new CHANNEL, SLACK_TOKEN - -# Query the date of the most recent snapshot. This is used to limit the number -# of victory messages to 1/day even if we upload multiple snapshots per day. -date_before_export = latest_snapshot_date(CHAIN_NAME) - -# conditionally add timestamps to logs without timestamps -add_timestamps_cmd = %q[awk '{ if ($0 !~ /^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{6}Z/) print strftime("[%Y-%m-%d %H:%M:%S]"), $0; else print $0; fflush(); }'] -upload_cmd = "set -o pipefail && \ -timeout --signal=KILL 8h ./upload_snapshot.sh #{CHAIN_NAME} #{LOG_EXPORT_DAEMON} #{LOG_EXPORT_METRICS} | #{add_timestamps_cmd}" - -# The command needs to be run indirectly to avoid syntax errors in the shell. 
-snapshot_uploaded = system('bash', '-c', upload_cmd, %i[out err] => LOG_EXPORT_SCRIPT_RUN) - -if snapshot_uploaded - date_after_export = latest_snapshot_date(CHAIN_NAME) - - # If this is the first new snapshot of the day, send a victory message to slack - unless date_before_export == date_after_export - client.post_message "✅ Snapshot uploaded for #{CHAIN_NAME}. 🌲🌳🌲🌳🌲" - end -else - client.post_message "⛔ Snapshot failed for #{CHAIN_NAME}. 🔥🌲🔥 " - # attach the log file and print the contents to STDOUT - [LOG_EXPORT_SCRIPT_RUN, LOG_EXPORT_DAEMON, LOG_EXPORT_METRICS].each do |log_file| - client.attach_files(log_file) if File.exist?(log_file) - end -end - -[LOG_EXPORT_SCRIPT_RUN, LOG_EXPORT_DAEMON, LOG_EXPORT_METRICS].each do |log_file| - puts "Snapshot export log:\n#{File.read(log_file)}\n\n" if File.exist?(log_file) -end diff --git a/terraform/modules/daily_snapshot/service/forest-env.tpl b/terraform/modules/daily_snapshot/service/forest-env.tpl deleted file mode 100644 index 14f644c7a..000000000 --- a/terraform/modules/daily_snapshot/service/forest-env.tpl +++ /dev/null @@ -1,11 +0,0 @@ -export R2_ACCESS_KEY="${R2_ACCESS_KEY}" -export R2_SECRET_KEY="${R2_SECRET_KEY}" -export R2_ENDPOINT="${r2_endpoint}" -export SLACK_API_TOKEN="${slack_token}" -export SLACK_NOTIF_CHANNEL="${slack_channel}" -export SNAPSHOT_BUCKET="${snapshot_bucket}" -export NEW_RELIC_API_KEY="${NEW_RELIC_API_KEY}" -export NEW_RELIC_ACCOUNT_ID="${NEW_RELIC_ACCOUNT_ID}" -export NEW_RELIC_REGION="${NEW_RELIC_REGION}" -export BASE_FOLDER="${BASE_FOLDER}" -export FOREST_TAG="${forest_tag}" diff --git a/terraform/modules/daily_snapshot/service/init.sh b/terraform/modules/daily_snapshot/service/init.sh deleted file mode 100755 index 2ad8d923a..000000000 --- a/terraform/modules/daily_snapshot/service/init.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash - -set -eux - -# Wait for cloud-init to finish initializing the machine -cloud-init status --wait - -# Setting DEBIAN_FRONTEND to ensure non-interactive operations for APT -export DEBIAN_FRONTEND=noninteractive - -# Using timeout to ensure the script retries if the APT servers are temporarily unavailable. -timeout 10m bash -c 'until apt-get -qqq --yes update && \ - apt-get -qqq --yes install ruby ruby-dev anacron awscli; do sleep 10; \ -done' - -# Install the gems -gem install docker-api slack-ruby-client -gem install activesupport -v 7.0.8 - -# 1. Configure aws -# 2. Create forest_db directory -# 3. 
Copy scripts to /etc/cron.hourly - -## Configure aws -aws configure set default.s3.multipart_chunksize 4GB -aws configure set aws_access_key_id "$R2_ACCESS_KEY" -aws configure set aws_secret_access_key "$R2_SECRET_KEY" - -## Create forest data directory -mkdir forest_db logs -chmod 777 forest_db logs -mkdir --parents -- "$BASE_FOLDER/forest_db/filops" - -# Make the scripts executable -chmod +x ./upload_filops_snapshot.sh - -# Run new_relic and fail2ban scripts -bash newrelic_fail2ban.sh & - -# Setup cron jobs -cp calibnet_cron_job mainnet_cron_job /etc/cron.hourly/ diff --git a/terraform/modules/daily_snapshot/service/mainnet_cron_job b/terraform/modules/daily_snapshot/service/mainnet_cron_job deleted file mode 100755 index 24eb56170..000000000 --- a/terraform/modules/daily_snapshot/service/mainnet_cron_job +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -# shellcheck source=/dev/null -source ~/.forest_env -cd "$BASE_FOLDER" || exit -flock -n /tmp/mainnet.lock -c "ruby daily_snapshot.rb mainnet > mainnet_log.txt 2>&1" || exit -flock -n /tmp/mainnet_filops.lock -c "./upload_filops_snapshot.sh mainnet > filops_mainnet_log.txt 2>&1" || exit diff --git a/terraform/modules/daily_snapshot/service/newrelic_fail2ban.sh b/terraform/modules/daily_snapshot/service/newrelic_fail2ban.sh deleted file mode 100644 index 7e608ff65..000000000 --- a/terraform/modules/daily_snapshot/service/newrelic_fail2ban.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash -# This script configures New Relic infrastructure monitoring and Fail2Ban. -# It sets up the New Relic license key and custom configuration, adds the New Relic repository, -# refreshes it, and installs the New Relic infrastructure agent. -# It also installs Fail2Ban, sets up its default configuration, and enables it to start at boot - -set -euo pipefail -# If new relic API key is provided, install the new relic agent -if [ -n "$NEW_RELIC_API_KEY" ] ; then - curl -Ls https://download.newrelic.com/install/newrelic-cli/scripts/install.sh | bash && \ - sudo NEW_RELIC_API_KEY="$NEW_RELIC_API_KEY" \ - NEW_RELIC_ACCOUNT_ID="$NEW_RELIC_ACCOUNT_ID" \ - NEW_RELIC_REGION="$NEW_RELIC_REGION" \ - /usr/local/bin/newrelic install -y - -# The provided configurations are specific to New Relic. To gain a deeper understanding of these configuration details, you can visit: -# https://docs.newrelic.com/docs/infrastructure/install-infrastructure-agent/configuration/infrastructure-agent-configuration-settings/#offline-time-to-reset -cat >> /etc/newrelic-infra.yml < /etc/newrelic-infra/logging.d/logging.yml <> "$LOG_EXPORT_METRICS" - sleep 15 - done -} - -function print_forest_logs { - cat forest.err forest.out > $LOG_EXPORT_DAEMON -} -trap print_forest_logs EXIT - -echo "[client]" > config.toml -echo 'data_dir = "/home/forest/forest_db"' >> config.toml -echo 'encrypt_keystore = false' >> config.toml - -echo "Chain: $CHAIN_NAME" - -# spawn a task in the background to periodically write Prometheus metrics to a file -( - set +x # Disable debugging for this subshell to keep the logs clean. - write_metrics -) & - -forest-tool db destroy --force --config config.toml --chain "$CHAIN_NAME" - -# Workaround for https://github.com/ChainSafe/forest/issues/3715 -# Normally, Forest should automatically download the latest snapshot. However, the performance -# of the download gets randomly bad, and the download times out. 
-aria2c -x5 https://forest-archive.chainsafe.dev/latest/$CHAIN_NAME/ -forest --config config.toml --chain "$CHAIN_NAME" --consume-snapshot *.car.zst --halt-after-import - -forest --config config.toml --chain "$CHAIN_NAME" --no-gc --save-token=token.txt --target-peer-count 500 --detach -timeout "$SYNC_TIMEOUT" forest-cli sync wait -forest-cli snapshot export -o forest_db/ -forest-cli --token=\$(cat token.txt) shutdown --force - -# Run full checks only for calibnet, given that it takes too long for mainnet. -if [ "$CHAIN_NAME" = "calibnet" ]; then - forest-tool snapshot validate --check-network "$CHAIN_NAME" forest_db/forest_snapshot_*.forest.car.zst -else - forest-tool archive info forest_db/forest_snapshot_*.forest.car.zst - forest-tool snapshot validate --check-links 0 --check-network "$CHAIN_NAME" --check-stateroots 5 forest_db/forest_snapshot_*.forest.car.zst -fi - - -# Kill the metrics writer process -kill %1 - -HEREDOC -) - -# Stop any lingering docker containers -CONTAINER_NAME="forest-snapshot-upload-node-$CHAIN_NAME" -docker stop "$CONTAINER_NAME" || true -docker rm --force "$CONTAINER_NAME" - -CHAIN_DB_DIR="$BASE_FOLDER/forest_db/$CHAIN_NAME" -CHAIN_LOGS_DIR="$BASE_FOLDER/logs" - -# Delete any existing snapshot files. It may be that the previous run failed -# before deleting those. -rm "$CHAIN_DB_DIR/forest_snapshot_$CHAIN_NAME"* - -# Run forest and generate a snapshot in forest_db/ -docker run \ - --name "$CONTAINER_NAME" \ - --rm \ - --user root \ - -v "$CHAIN_DB_DIR:/home/forest/forest_db":z \ - -v "$CHAIN_LOGS_DIR:/home/forest/logs":z \ - --entrypoint /bin/bash \ - ghcr.io/chainsafe/forest:"${FOREST_TAG}" \ - -c "$COMMANDS" || exit 1 - -aws --endpoint "$R2_ENDPOINT" s3 cp --no-progress "$CHAIN_DB_DIR/forest_snapshot_$CHAIN_NAME"*.forest.car.zst s3://forest-archive/"$CHAIN_NAME"/latest/ || exit 1 - -# Delete snapshot files -rm "$CHAIN_DB_DIR/forest_snapshot_$CHAIN_NAME"* diff --git a/terraform/modules/daily_snapshot/variable.tf b/terraform/modules/daily_snapshot/variable.tf deleted file mode 100644 index 6ccfe9807..000000000 --- a/terraform/modules/daily_snapshot/variable.tf +++ /dev/null @@ -1,111 +0,0 @@ -variable "digitalocean_token" { - description = "Token for authentication." 
- type = string - sensitive = true -} - -variable "name" { - description = "The name of Forest Droplet" - type = string -} - -variable "size" { - description = "The size of the droplet instance to launch" - type = string -} - -variable "slack_channel" { - description = "slack channel name for notifications" - type = string -} - -variable "slack_token" { - description = "slack access token" - type = string - sensitive = true -} - -variable "R2_ACCESS_KEY" { - description = "S3 access key id" - type = string - sensitive = true -} - -variable "R2_SECRET_KEY" { - description = "S3 private access key" - type = string - sensitive = true -} - -variable "snapshot_bucket" { - description = "S3 bucket containing the snapshots" - type = string - default = "forest-snapshots" -} - -variable "r2_endpoint" { - description = "R2 endpoint for the snapshots" - type = string -} - -variable "snapshot_endpoint" { - description = "S3 endpoint for the snapshots" - type = string - default = "https://fra1.digitaloceanspaces.com/" -} - -variable "forest_tag" { - description = "Image tag for the Forest container" - type = string - default = "latest" -} - -variable "image" { - description = "The ID of the AMI to use for the Droplet" - type = string - default = "docker-20-04" -} - -variable "region" { - description = "The region where resources will be created" - type = string - default = "fra1" -} - -variable "project" { - description = "DigitalOcean project used as parent for the created droplet" - type = string - default = "Forest-DEV" # Alternative: "Default" -} - -variable "source_addresses" { - description = "List of source addresses." - type = list(string) - default = ["0.0.0.0/0", "::/0"] -} - -variable "destination_addresses" { - description = "List of destination addresses." - type = list(string) - default = ["0.0.0.0/0", "::/0"] -} - -variable "NEW_RELIC_REGION" { - description = "The New Relic Platform Region" - type = string - default = "EU" -} - -variable "NEW_RELIC_API_KEY" { - description = "New Relic API KEY" - default = "" - type = string - sensitive = true -} - -variable "NEW_RELIC_ACCOUNT_ID" { - description = "The New Relic Account ID" - default = "" - type = string - sensitive = true -} diff --git a/terraform/modules/sync_check/main.tf b/terraform/modules/sync_check/main.tf deleted file mode 100644 index b5755010a..000000000 --- a/terraform/modules/sync_check/main.tf +++ /dev/null @@ -1,131 +0,0 @@ -# This terraform script executes the following steps: -# - Zip the ruby and shell script files (the hash of this zip file is used to -# determine when to re-deploy the service) -# - Boot a new droplet -# - Copy over the zip file -# - Run calibnet and mainnet sync check in the background - -terraform { - required_version = "~> 1.3" - - required_providers { - digitalocean = { - source = "digitalocean/digitalocean" - version = "~> 2.0" - } - external = { - source = "hashicorp/external" - version = "~> 2.1" - } - local = { - source = "hashicorp/local" - version = "~> 2.1" - } - - } -} - -provider "digitalocean" { - token = var.digitalocean_token -} - -// Ugly hack because 'archive_file' cannot mix files and folders. -data "external" "sources_tar" { - program = ["sh", "${path.module}/prep_sources.sh", path.module] -} - -data "local_file" "sources" { - filename = data.external.sources_tar.result.path -} - -// Note: The init.sh file is also included in the sources.zip such that the hash -// of the archive captures the entire state of the machine. 
-// This is a workaround, and because of this, we need to suppress the tflint warning here -// for unused declarations related to the 'init.sh' file. -// tflint-ignore: terraform_unused_declarations -data "local_file" "init" { - filename = "${path.module}/service/init.sh" -} - -data "digitalocean_ssh_keys" "keys" { - sort { - key = "name" - direction = "asc" - } -} - -# Set required environment variables -locals { - env_content = templatefile("${path.module}/service/forest-env.tpl", { - FOREST_TARGET_DATA = "/volumes/forest_data", - FOREST_TARGET_SCRIPTS = "/volumes/sync_check", - FOREST_TARGET_RUBY_COMMON = "/volumes/ruby_common", - slack_token = var.slack_token, - slack_channel = var.slack_channel, - NEW_RELIC_API_KEY = var.NEW_RELIC_API_KEY, - NEW_RELIC_ACCOUNT_ID = var.NEW_RELIC_ACCOUNT_ID, - NEW_RELIC_REGION = var.NEW_RELIC_REGION, - forest_tag = "edge" - }) -} - -locals { - init_commands = [ - "tar xf sources.tar", - # Set required environment variables - "echo '${local.env_content}' >> /root/.forest_env", - "echo '. ~/.forest_env' >> .bashrc", - ". ~/.forest_env", - "nohup sh ./init.sh > init_log.txt &", - "cp ./restart.service /etc/systemd/system/", - "systemctl enable restart.service", - # Exiting without a sleep sometimes kills the script :-/ - "sleep 60s", - ] -} - -resource "digitalocean_droplet" "forest" { - image = var.image - name = var.name - region = var.region - size = var.size - # Re-initialize resource if this hash changes: - user_data = join("-", [data.local_file.sources.content_sha256, sha256(join("", local.init_commands))]) - tags = ["iac"] - ssh_keys = data.digitalocean_ssh_keys.keys.ssh_keys[*].fingerprint - monitoring = true - - graceful_shutdown = false - - connection { - host = self.ipv4_address - user = "root" - type = "ssh" - } - - # Push the sources.tar file to the newly booted droplet - provisioner "file" { - source = data.local_file.sources.filename - destination = "/root/sources.tar" - } - - provisioner "remote-exec" { - inline = local.init_commands - } -} - -data "digitalocean_project" "forest_project" { - name = var.project -} - -# Connect the droplet to the forest project (otherwise it ends up in -# "ChainBridge" which is the default project) -resource "digitalocean_project_resources" "connect_forest_project" { - project = data.digitalocean_project.forest_project.id - resources = [digitalocean_droplet.forest.urn] -} - -# This ip address may be used in the future by monitoring software -output "ip" { - value = [digitalocean_droplet.forest.ipv4_address] -} diff --git a/terraform/modules/sync_check/prep_sources.sh b/terraform/modules/sync_check/prep_sources.sh deleted file mode 100755 index 50bd009d6..000000000 --- a/terraform/modules/sync_check/prep_sources.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -# Enable strict error handling and command tracing -set -ex - -# Copy local source files in a folder together with ruby_common and create a zip archive. 
- -cd "$1" || exit -cp --archive ../../../scripts/ruby_common service/ || exit - -rm -f sources.tar -(cd service && tar cf ../sources.tar --sort=name --mtime='UTC 2019-01-01' ./* > /dev/null 2>&1) || exit -rm -fr service/ruby_common -echo "{ \"path\": \"$1/sources.tar\" }" diff --git a/terraform/modules/sync_check/service/Dockerfile-tester b/terraform/modules/sync_check/service/Dockerfile-tester deleted file mode 100644 index ed8b1aa89..000000000 --- a/terraform/modules/sync_check/service/Dockerfile-tester +++ /dev/null @@ -1,13 +0,0 @@ -FROM ubuntu:22.04 - -RUN apt-get update && \ - apt-get install --no-install-recommends -y docker ruby make gcc build-essential ruby-dev curl \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - -WORKDIR /chainsafe - -COPY Gemfile Gemfile.lock health_check.sh sync_check_process.rb sync_check.rb sync_check.toml ./ -COPY ruby_common ruby_common - -RUN gem install bundler && bundle install diff --git a/terraform/modules/sync_check/service/Gemfile b/terraform/modules/sync_check/service/Gemfile deleted file mode 100644 index e1a861576..000000000 --- a/terraform/modules/sync_check/service/Gemfile +++ /dev/null @@ -1,7 +0,0 @@ -# frozen_string_literal: true - -source 'https://rubygems.org' - -gem 'docker-api', '>= 2.2.0' -gem 'slack-ruby-client', '>= 2.1.0' -gem 'sys-filesystem', '>=1.4.3' diff --git a/terraform/modules/sync_check/service/Gemfile.lock b/terraform/modules/sync_check/service/Gemfile.lock deleted file mode 100644 index 9832df215..000000000 --- a/terraform/modules/sync_check/service/Gemfile.lock +++ /dev/null @@ -1,41 +0,0 @@ -GEM - remote: https://rubygems.org/ - specs: - docker-api (2.2.0) - excon (>= 0.47.0) - multi_json - excon (0.99.0) - faraday (2.7.4) - faraday-net_http (>= 2.0, < 3.1) - ruby2_keywords (>= 0.0.4) - faraday-mashify (0.1.1) - faraday (~> 2.0) - hashie - faraday-multipart (1.0.4) - multipart-post (~> 2) - faraday-net_http (3.0.2) - ffi (1.15.5) - gli (2.21.0) - hashie (5.0.0) - multi_json (1.15.0) - multipart-post (2.3.0) - ruby2_keywords (0.0.5) - slack-ruby-client (2.1.0) - faraday (>= 2.0) - faraday-mashify - faraday-multipart - gli - hashie - sys-filesystem (1.4.3) - ffi (~> 1.1) - -PLATFORMS - x86_64-linux - -DEPENDENCIES - docker-api (>= 2.2.0) - slack-ruby-client (>= 2.1.0) - sys-filesystem (>= 1.4.3) - -BUNDLED WITH - 2.3.4 diff --git a/terraform/modules/sync_check/service/docker-compose.yml b/terraform/modules/sync_check/service/docker-compose.yml deleted file mode 100644 index 51547105d..000000000 --- a/terraform/modules/sync_check/service/docker-compose.yml +++ /dev/null @@ -1,118 +0,0 @@ -# Docker compose file to run continuous Forest sync on both mainnet and calibnet. 
- -version: "3.7" - -services: - forest_mainnet: - image: ghcr.io/chainsafe/forest:${FOREST_TAG} - hostname: forest-mainnet - container_name: forest-mainnet - networks: - - mainnet - volumes: - - type: volume - source: forest-data - target: ${FOREST_TARGET_DATA} - - type: volume - source: sync-check - target: ${FOREST_TARGET_SCRIPTS} - command: - - '--chain' - - 'mainnet' - - '--config' - - ${FOREST_TARGET_SCRIPTS}/sync_check.toml - - '--auto-download-snapshot' - - '--save-token' - - '/tmp/admin_token' - environment: - FOREST_GC_TRIGGER_FACTOR: "1.4" - restart: unless-stopped - labels: - com.centurylinklabs.watchtower.enable: true - forest_calibnet: - image: ghcr.io/chainsafe/forest:${FOREST_TAG} - hostname: forest-calibnet - container_name: forest-calibnet - networks: - - calibnet - volumes: - - type: volume - source: forest-data - target: ${FOREST_TARGET_DATA} - - type: volume - source: sync-check - target: ${FOREST_TARGET_SCRIPTS} - command: - - '--chain' - - 'calibnet' - - '--config' - - ${FOREST_TARGET_SCRIPTS}/sync_check.toml - - '--auto-download-snapshot' - - '--save-token' - - '/tmp/admin_token' - environment: - FOREST_GC_TRIGGER_FACTOR: "1.2" - restart: unless-stopped - labels: - com.centurylinklabs.watchtower.enable: true - # Probe container to validate Forest syncing. Needs to be on the same network. - forest_tester: - build: - context: . - dockerfile: Dockerfile-tester - container_name: forest-tester - privileged: true - networks: - - mainnet - - calibnet - volumes: - - type: volume - source: forest-data - target: ${FOREST_TARGET_DATA} - - type: volume - read_only: true - source: sync-check - target: ${FOREST_TARGET_SCRIPTS} - # Put common Ruby utils into a path that should be by default in Ruby PATH - - type: volume - read_only: true - source: ruby-common - target: /usr/local/share/ruby/site_ruby/cs_utils - - /var/run/docker.sock:/var/run/docker.sock - environment: - - LOG_DIR=${FOREST_TARGET_DATA} - - SCRIPTS_DIR=${FOREST_TARGET_SCRIPTS} - - FOREST_SLACK_API_TOKEN=${FOREST_SLACK_API_TOKEN} - - FOREST_SLACK_NOTIF_CHANNEL=${FOREST_SLACK_NOTIF_CHANNEL} - - FOREST_TARGET_DATA=${FOREST_TARGET_DATA} - - FOREST_TARGET_SCRIPTS=${FOREST_TARGET_SCRIPTS} - - FOREST_TAG=${FOREST_TAG} - entrypoint: ["/bin/bash", "-c"] - command: - - | - ruby ${FOREST_TARGET_SCRIPTS}/sync_check.rb forest-mainnet & - ruby ${FOREST_TARGET_SCRIPTS}/sync_check.rb forest-calibnet & - wait - sleep infinity - depends_on: - - forest_mainnet - - forest_calibnet - restart: unless-stopped - labels: - com.centurylinklabs.watchtower.enable: true - com.centurylinklabs.watchtower.depends-on: "forest-mainnet,forest-calibnet" - -volumes: - forest-data: - external: true - name: forest-data - sync-check: - external: true - name: sync-check - ruby-common: - external: true - name: ruby-common - -networks: - mainnet: - calibnet: diff --git a/terraform/modules/sync_check/service/forest-env.tpl b/terraform/modules/sync_check/service/forest-env.tpl deleted file mode 100644 index 4de4892be..000000000 --- a/terraform/modules/sync_check/service/forest-env.tpl +++ /dev/null @@ -1,9 +0,0 @@ -export FOREST_TARGET_DATA="${FOREST_TARGET_DATA}" -export FOREST_TARGET_SCRIPTS="${FOREST_TARGET_SCRIPTS}" -export FOREST_TARGET_RUBY_COMMON="${FOREST_TARGET_RUBY_COMMON}" -export FOREST_SLACK_API_TOKEN="${slack_token}" -export FOREST_SLACK_NOTIF_CHANNEL="${slack_channel}" -export NEW_RELIC_API_KEY="${NEW_RELIC_API_KEY}" -export NEW_RELIC_ACCOUNT_ID="${NEW_RELIC_ACCOUNT_ID}" -export NEW_RELIC_REGION="${NEW_RELIC_REGION}" -export 
FOREST_TAG="${forest_tag}" diff --git a/terraform/modules/sync_check/service/health_check.sh b/terraform/modules/sync_check/service/health_check.sh deleted file mode 100755 index 1cfe8bd8c..000000000 --- a/terraform/modules/sync_check/service/health_check.sh +++ /dev/null @@ -1,112 +0,0 @@ -#!/bin/bash - -# Script to check health status of a running node. -# The only prerequisite here is that the `forest` process is running. -# The script will wait till metrics endpoint becomes available. -# Input: Forest hostname - -# Exit codes -RET_OK=0 -RET_SYNC_TIPSET_STALE=1 -RET_SYNC_ERROR=2 -RET_SYNC_TIMEOUT=3 -RET_HOSTNAME_NOT_SET=4 - -if [ $# -eq 0 ]; then - echo "No arguments supplied. Need to provide Forest hostname, e.g. forest-mainnet." - exit "$RET_HOSTNAME_NOT_SET" -fi - -# Governs how long the health check will run to assert Forest condition -HEALTH_CHECK_DURATION_SECONDS=${HEALTH_CHECK_DURATION_SECONDS:-"360"} -# Forest metrics endpoint path -FOREST_METRICS_ENDPOINT=${FOREST_METRICS_ENDPOINT:-"http://$1:6116/metrics"} -# Initial sync timeout (in seconds) after which the health check will fail -HEALTH_CHECK_SYNC_TIMEOUT_SECONDS=${HEALTH_CHECK_SYNC_TIMEOUT_SECONDS:-"7200"} - -# Extracts metric value from the metric data -# Arg: name of the metric -function get_metric_value() { - grep -E "^$1" <<< "$metrics" | cut -d' ' -f2 -} - -# Updates metrics data with the latest metrics from Prometheus -# Arg: none -function update_metrics() { - metrics=$(curl --silent "$FOREST_METRICS_ENDPOINT") -} - -# Checks if an error occurred and is visible in the metrics. -# Arg 1: name of the error metric -# Arg 2: maximum number of occurrences for the assertion to pass (0 for strictly not pass) -function assert_error() { - errors="$(get_metric_value "$1")" - if [[ "$errors" -gt "$2" ]]; then - echo "❌ $1: $errors (max: $2)" - ret=$RET_SYNC_ERROR - fi -} - -##### Actual script - -# Wait for Forest to start syncing -# Excluding `tipset_start` from the unbound variable check -set +u -timeout="$HEALTH_CHECK_SYNC_TIMEOUT_SECONDS" -echo "⏳ Waiting for Forest to start syncing (up to $timeout seconds)..." -until [ -n "$tipset_start" ] || [ "$timeout" -le 0 ] -do - update_metrics - tipset_start="$(get_metric_value "last_validated_tipset_epoch")" - sleep 1 - timeout=$((timeout-1)) -done -# Re-enabling the unbound variable check -set -u - -if [ "$timeout" -le 0 ]; then - echo "❌ Timed out on sync wait" - exit "$RET_SYNC_TIMEOUT" -fi -echo "✅ Forest started syncing" - -# Let Forest run for the health check period -echo "⏳ Waiting for the health probe to finish..." -sleep "$HEALTH_CHECK_DURATION_SECONDS" - -# Grab last synced tipset epoch -update_metrics -tipset_end="$(get_metric_value "last_validated_tipset_epoch")" - -if [ -z "$tipset_end" ]; then - echo "❌ Did not manage to get sync status" - exit "$RET_SYNC_ERROR" -fi - -# Assert tipset epoch moved forward -echo "👉 Tipset start: $tipset_start, end: $tipset_end" -if [ "$tipset_end" -gt "$tipset_start" ]; then - echo "✅ Tipset epoch moving forward" - ret="$RET_OK" -else - echo "❌ Tipset epoch didn't move forward." 
- ret="$RET_SYNC_TIPSET_STALE" -fi - -# Assert there are no sync errors -assert_error "network_head_evaluation_errors" 0 -assert_error "bootstrap_errors" 2 -assert_error "follow_network_interruptions" 0 -assert_error "follow_network_errors" 0 - -if [ "$ret" -ne "$RET_SYNC_ERROR" ]; then - echo "✅ No sync errors" -fi - -if [ "$ret" -eq "$RET_OK" ]; then - echo "✅ Health check passed" -else - echo "❌ Health check failed" -fi - -exit "$ret" diff --git a/terraform/modules/sync_check/service/init.sh b/terraform/modules/sync_check/service/init.sh deleted file mode 100755 index d96561371..000000000 --- a/terraform/modules/sync_check/service/init.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash - -## Enable strict error handling, command tracing, and pipefail -set -eux - -# Wait for cloud-init to finish initializing the machine -cloud-init status --wait - -# Setting DEBIAN_FRONTEND to ensure non-interactive operations for APT -export DEBIAN_FRONTEND=noninteractive - -# Using timeout to ensure the script retries if the APT servers are temporarily unavailable. -timeout 10m bash -c 'until apt-get -qqq --yes update && \ - apt-get -qqq --yes install ruby ruby-dev gcc make; do sleep 10; \ -done' - -gem install slack-ruby-client sys-filesystem - -nohup /bin/bash ./run_service.sh > run_service_log.txt & - -if [ -n "$NEW_RELIC_API_KEY" ] ; then - curl -Ls https://download.newrelic.com/install/newrelic-cli/scripts/install.sh | bash && \ - sudo NEW_RELIC_API_KEY="$NEW_RELIC_API_KEY" \ - NEW_RELIC_ACCOUNT_ID="$NEW_RELIC_ACCOUNT_ID" \ - NEW_RELIC_REGION="$NEW_RELIC_REGION" \ - /usr/local/bin/newrelic install -y - -# The provided configurations are specific to New Relic. To gain a deeper understanding of these configuration details, you can visit: -# https://docs.newrelic.com/docs/infrastructure/install-infrastructure-agent/configuration/infrastructure-agent-configuration-settings/#offline-time-to-reset -cat >> /etc/newrelic-infra.yml < /etc/newrelic-infra/logging.d/logging.yml < /dev/null || true -docker container rm --force forest-calibnet 2> /dev/null || true -docker container rm --force forest-mainnet 2> /dev/null || true -docker container rm --force forest-tester 2> /dev/null || true - -## Ensure watchtower is running -docker stop watchtower 2> /dev/null || true -docker wait watchtower 2> /dev/null || true -docker run \ - --detach \ - --restart unless-stopped \ - --privileged \ - -v /var/run/docker.sock:/var/run/docker.sock \ - --name watchtower \ - containrrr/watchtower \ - --label-enable --include-stopped --revive-stopped --stop-timeout 120s --interval 600 - -## We need it to access the DATA_DIR regardless of the user. 
-chmod 0777 /var/lib/docker/volumes/forest-data/_data - -## Ensure volumes are clean -rm -rf /var/lib/docker/volumes/forest-data/_data/* -rm -rf /var/lib/docker/volumes/sync-check/_data/* -rm -rf /var/lib/docker/volumes/ruby-common/_data/* - -## Copy all relevant scripts -cp --recursive /root/* /var/lib/docker/volumes/sync-check/_data/ -cp --recursive /root/ruby_common/* /var/lib/docker/volumes/ruby-common/_data/ - -## Run health check status of a running node -ruby sync_check_process.rb diff --git a/terraform/modules/sync_check/service/sync_check.rb b/terraform/modules/sync_check/service/sync_check.rb deleted file mode 100755 index 3e939bde4..000000000 --- a/terraform/modules/sync_check/service/sync_check.rb +++ /dev/null @@ -1,62 +0,0 @@ -# frozen_string_literal: true - -require_relative 'ruby_common/slack_client' -require_relative 'ruby_common/docker_utils' -require_relative 'ruby_common/utils' -require_relative 'sync_check_process' -require 'logger' -require 'fileutils' - -# Retrieves an environmental variable, failing if its not set or empty. -def get_and_assert_env_variable(name) - var = ENV[name] - raise "Please set #{name} environmental variable" if var.nil? || var.empty? - - var -end - -SLACK_TOKEN = get_and_assert_env_variable 'FOREST_SLACK_API_TOKEN' -CHANNEL = get_and_assert_env_variable 'FOREST_SLACK_NOTIF_CHANNEL' -SCRIPTS_DIR = get_and_assert_env_variable 'SCRIPTS_DIR' -LOG_DIR = get_and_assert_env_variable 'LOG_DIR' -TARGET_DATA = get_and_assert_env_variable 'FOREST_TARGET_DATA' - -hostname = ARGV[0] -raise 'No arguments supplied. Please provide Forest hostname, e.g. forest-mainnet' if ARGV.empty? - -network = hostname.match(/-(\w+)$/)[1] - -# Current datetime, to append to the log files -DATE = Time.new.strftime '%FT%H:%M:%S' -LOG_HEALTH = "#{LOG_DIR}/#{hostname}_#{DATE}_health" -LOG_FOREST = "#{LOG_DIR}/#{hostname}_#{DATE}_forest" -LOG_SYNC = "#{LOG_DIR}/#{hostname}_#{DATE}_sync" - -# Create log directory -FileUtils.mkdir_p LOG_DIR - -logger = Logger.new(LOG_SYNC) - -begin - # Run the actual health check - logger.info 'Running the health check...' - health_check_passed = system("bash #{SCRIPTS_DIR}/health_check.sh #{hostname} > #{LOG_HEALTH} 2>&1") - logger.info 'Health check finished' - - # Save the log capture from the Forest container - container_logs = DockerUtils.get_container_logs hostname - File.write(LOG_FOREST, container_logs) -ensure - client = SlackClient.new CHANNEL, SLACK_TOKEN - - if health_check_passed - client.post_message "✅ Sync check for #{hostname} passed. 🌲🌳🌲🌳🌲" - else - client.post_message "⛔ Sync check for #{hostname} fiascoed. 
🔥🌲🔥" - FileUtils.rm_rf("#{TARGET_DATA}/#{network}") - logger.info 'DB Destroyed' - end - client.attach_files(LOG_HEALTH, LOG_SYNC, LOG_FOREST) -end - -logger.info 'Sync check finished' diff --git a/terraform/modules/sync_check/service/sync_check.toml b/terraform/modules/sync_check/service/sync_check.toml deleted file mode 100644 index fe5e5b082..000000000 --- a/terraform/modules/sync_check/service/sync_check.toml +++ /dev/null @@ -1,4 +0,0 @@ -[client] -data_dir = "/volumes/forest_data" -encrypt_keystore = false -metrics_address = "0.0.0.0:6116" diff --git a/terraform/modules/sync_check/service/sync_check_process.rb b/terraform/modules/sync_check/service/sync_check_process.rb deleted file mode 100755 index 9751d7c33..000000000 --- a/terraform/modules/sync_check/service/sync_check_process.rb +++ /dev/null @@ -1,123 +0,0 @@ -# frozen_string_literal: true - -require_relative 'ruby_common/slack_client' -require_relative 'ruby_common/utils' - -require 'English' -require 'fileutils' -require 'sys/filesystem' -require 'logger' -require 'open3' - -SLACK_TOKEN = get_and_assert_env_variable 'FOREST_SLACK_API_TOKEN' -CHANNEL = get_and_assert_env_variable 'FOREST_SLACK_NOTIF_CHANNEL' -FOREST_DATA = get_and_assert_env_variable 'FOREST_TARGET_DATA' -FOREST_SCRIPTS = get_and_assert_env_variable 'FOREST_TARGET_SCRIPTS' -FOREST_TAG = get_and_assert_env_variable 'FOREST_TAG' - -# Sync check class encompassing all required methods and fields -class SyncCheck - def initialize(slack_client = nil) - @logger = Logger.new($stdout) - @client = slack_client || SlackClient.new(CHANNEL, SLACK_TOKEN) - end - - # Runs a command with an arbitrary binary available in the chainsafe/forest image - def run_forest_container(binary, command) - @logger.debug "Running `#{binary}` command with #{command}" - stdout, stderr, status = Open3.capture3("docker run --entrypoint #{binary} \ - --init \ - --volume forest-data:#{FOREST_DATA} \ - --volume sync-check:#{FOREST_SCRIPTS} \ - --rm \ - ghcr.io/chainsafe/forest:#{FOREST_TAG} \ - --config #{FOREST_SCRIPTS}/sync_check.toml \ - #{command}") - raise "Failed `#{binary} #{command}`.\n```\nSTDOUT:\n#{stdout}\nSTDERR:\n#{stderr}```" unless status.success? - end - - # Runs a command for forest-tool. The configuration is pre-defined. - def run_forest_tool(command) - run_forest_container('forest-tool', command) - end - - # Runs a command for forest node. The configuration is pre-defined. - def run_forest(command) - run_forest_container('forest', command) - end - - # Gets current disk usage. - def disk_usage - stat = Sys::Filesystem.stat('/') - 1 - stat.blocks_available.fdiv(stat.blocks) - end - - # Starts docker compose services. - def start_services - @logger.info 'Starting services' - `docker compose up --build --force-recreate --detach` - raise 'Failed to start services' unless $CHILD_STATUS.success? - end - - # Stops docker compose services - def stop_services - @logger.info 'Stopping services' - `docker compose down` - raise 'Failed to stop services' unless $CHILD_STATUS.success? - end - - # Checks if the docker compose services are up - def services_up? - output = `docker compose ps --services --filter "status=running"` - $CHILD_STATUS.success? && !output.strip.empty? 
- end - - # logs and sends a slack message containing the error description - def report_error(error) - @logger.error error.message - @client.post_message '💀 Sync check fiasco ❌' - @client.attach_comment error.message - end - - # Cleans up the sync check - def cleanup - @logger.info 'Cleaning up sync check' - @client.post_message '🧹 Cleaning up sync check' - - stop_services - cleanup_command = "docker run --rm --volume forest-data:#{FOREST_DATA} busybox sh -c 'rm -rf #{FOREST_DATA}/**'" - - stdout, stderr, status = Open3.capture3(cleanup_command) - unless status.success? - error_message = "Cleanup failed with status: #{status.exitstatus}. STDOUT: #{stdout}, STDERR: #{stderr}" - @logger.error error_message - @client.attach_comment "Cleanup error: #{error_message}" - raise 'Failed to clean up Docker volume' - else - @logger.info 'Cleanup successful' - @client.attach_comment '🧹 Docker volume cleanup completed successfully ✅' - end - - @client.attach_comment '🧹 Cleanup finished ✅' - end - - # start the sync check loop - def run - loop do - begin - `docker image prune -f` - cleanup unless disk_usage < 0.85 - start_services unless services_up? - rescue StandardError => e - report_error e - end - - # sleep 1 hour before checking again - sleep 60 * 60 - end - end -end - -##### -# Runs only when executed directly -SyncCheck.new.run if __FILE__ == $PROGRAM_NAME diff --git a/terraform/modules/sync_check/variable.tf b/terraform/modules/sync_check/variable.tf deleted file mode 100644 index 0d5760e21..000000000 --- a/terraform/modules/sync_check/variable.tf +++ /dev/null @@ -1,64 +0,0 @@ -variable "digitalocean_token" { - description = "Token for authentication." - type = string - sensitive = true -} - -variable "name" { - description = "The name of Forest Droplet" - type = string -} - -variable "size" { - description = "The size of the droplet instance to launch" - type = string -} - -variable "slack_channel" { - description = "slack channel name for notifications" - type = string -} - -variable "slack_token" { - description = "slack access token" - type = string - sensitive = true -} - -variable "image" { - description = "The ID of the AMI to use for the Droplet" - type = string - default = "docker-20-04" -} - -variable "region" { - description = "The region where resources will be created" - type = string - default = "fra1" -} - -variable "project" { - description = "DigitalOcean project used as parent for the created droplet" - type = string - default = "Forest-DEV" # Alternative: "Default" -} - -variable "NEW_RELIC_REGION" { - description = "The New Relic Platform Region" - type = string - default = "EU" -} - -variable "NEW_RELIC_API_KEY" { - description = "New Relic API KEY" - default = "" - type = string - sensitive = true -} - -variable "NEW_RELIC_ACCOUNT_ID" { - description = "The New Relic Account ID" - default = "" - type = string - sensitive = true -} diff --git a/terraform/sync_check/.terraform.lock.hcl b/terraform/sync_check/.terraform.lock.hcl deleted file mode 100644 index 12db55d29..000000000 --- a/terraform/sync_check/.terraform.lock.hcl +++ /dev/null @@ -1,64 +0,0 @@ -# This file is maintained automatically by "terraform init". -# Manual edits may be lost in future updates. 
- -provider "registry.terraform.io/digitalocean/digitalocean" { - version = "2.28.1" - constraints = "~> 2.0" - hashes = [ - "h1:aSxcSWa0wQQGLW2/XyivecmR/BL9fVtw42Bg2UngmT0=", - "zh:2e22294110ddfc4cd8c51e342f56788175d02b2bb311f1834f3c144a80dc30dc", - "zh:59641f0c7b10befced370008a3178670ee1103fcb504a9b71f90d6f697738fc2", - "zh:5d4c48701dbf3316cc149a01e44bed8cacb4426d4981a415ca1149a26af608af", - "zh:5fc27f1948669378d5f1d0cd0352fed92a3516738cb4bdeff026ea9242e364b0", - "zh:673bb803646c359809db39ccead45c1ea5ab12b47dd4dd14a576fe5c6300e386", - "zh:67e212f02ac0acdfc9d448299acefc3e4ea6aaad9a229f7f97e9064e1512a33f", - "zh:6dcd108fb68ce1b1cbfe9d4c4df0f0f4c3c09bd6154724df9bd33e1630cd6e0b", - "zh:757e4b2f3c728a6b781521e8f7e47ed8bd7bf189f5e273d953a0d849463d9af5", - "zh:7ae1a6cb34c45a00f84090ad2dcc7eac23fd748a026801bfe19e60a96893882f", - "zh:7fecfea5b2b2e79ee1c49b824995ed5232e7f538fdfe33aa984a1bda90bf1587", - "zh:8e5671bd7cbc2e45e3e9e4ec803e8d9b671f687c66963f52bf1bf58cb7b05819", - "zh:a6a5b504c95eff173e2ac017523a97e3300d93fa887465b0278c2633dd0cc608", - "zh:cd273e7c690e6758761f583a35f76d0e86259ccc842be137931e4cf9d8c3d1cc", - "zh:e815c665ed5c32057d1f4dc0288b6387ba38b2a5bf5ef0c36987810a141d3b39", - "zh:ebac3d11f2e968f88f95e3f277b547f505ecd7150df0fd763087499b251bca9e", - "zh:fb970aa84783edc03ea4ec53d2b896721fbab08b4b764a284e3d7eb49bfa046e", - ] -} - -provider "registry.terraform.io/hashicorp/external" { - version = "2.3.1" - hashes = [ - "h1:bROCw6g5D/3fFnWeJ01L4IrdnJl1ILU8DGDgXCtYzaY=", - "zh:001e2886dc81fc98cf17cf34c0d53cb2dae1e869464792576e11b0f34ee92f54", - "zh:2eeac58dd75b1abdf91945ac4284c9ccb2bfb17fa9bdb5f5d408148ff553b3ee", - "zh:2fc39079ba61411a737df2908942e6970cb67ed2f4fb19090cd44ce2082903dd", - "zh:472a71c624952cff7aa98a7b967f6c7bb53153dbd2b8f356ceb286e6743bb4e2", - "zh:4cff06d31272aac8bc35e9b7faec42cf4554cbcbae1092eaab6ab7f643c215d9", - "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", - "zh:7ed16ccd2049fa089616b98c0bd57219f407958f318f3c697843e2397ddf70df", - "zh:842696362c92bf2645eb85c739410fd51376be6c488733efae44f4ce688da50e", - "zh:8985129f2eccfd7f1841ce06f3bf2bbede6352ec9e9f926fbaa6b1a05313b326", - "zh:a5f0602d8ec991a5411ef42f872aa90f6347e93886ce67905c53cfea37278e05", - "zh:bf4ab82cbe5256dcef16949973bf6aa1a98c2c73a98d6a44ee7bc40809d002b8", - "zh:e70770be62aa70198fa899526d671643ff99eecf265bf1a50e798fc3480bd417", - ] -} - -provider "registry.terraform.io/hashicorp/local" { - version = "2.4.0" - hashes = [ - "h1:R97FTYETo88sT2VHfMgkPU3lzCsZLunPftjSI5vfKe8=", - "zh:53604cd29cb92538668fe09565c739358dc53ca56f9f11312b9d7de81e48fab9", - "zh:66a46e9c508716a1c98efbf793092f03d50049fa4a83cd6b2251e9a06aca2acf", - "zh:70a6f6a852dd83768d0778ce9817d81d4b3f073fab8fa570bff92dcb0824f732", - "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3", - "zh:82a803f2f484c8b766e2e9c32343e9c89b91997b9f8d2697f9f3837f62926b35", - "zh:9708a4e40d6cc4b8afd1352e5186e6e1502f6ae599867c120967aebe9d90ed04", - "zh:973f65ce0d67c585f4ec250c1e634c9b22d9c4288b484ee2a871d7fa1e317406", - "zh:c8fa0f98f9316e4cfef082aa9b785ba16e36ff754d6aba8b456dab9500e671c6", - "zh:cfa5342a5f5188b20db246c73ac823918c189468e1382cb3c48a9c0c08fc5bf7", - "zh:e0e2b477c7e899c63b06b38cd8684a893d834d6d0b5e9b033cedc06dd7ffe9e2", - "zh:f62d7d05ea1ee566f732505200ab38d94315a4add27947a60afa29860822d3fc", - "zh:fa7ce69dde358e172bd719014ad637634bbdabc49363104f4fca759b4b73f2ce", - ] -} diff --git a/terraform/sync_check/README.md b/terraform/sync_check/README.md deleted file mode 100644 index 2e0eba8c5..000000000 --- 
a/terraform/sync_check/README.md +++ /dev/null @@ -1,86 +0,0 @@ -# Overview - -This folder contains an infrastructure configuration that simplifies the setup and automatic initiation of the Forest Sync-Check service on a DigitalOcean droplet. The configuration is specifically designed to perform sync checks on both the Calibnet and Mainnet networks. It also sends notifications to the Forest Slack notification channel. Moreover, the sync check service is configured to automatically restart upon droplet reboot, and the New Relic Infrastructure agent is installed for monitoring purposes - -# Workflow - -Changing any of the settings (such as the size of the droplet or the operating -system) will automatically re-deploy the service. The same is true for changing -any of the scripts. - -The sync check is configured using `restart unless-stopped` docker flag, -which restart automatically upon droplet reboot. - -The workflow has access to all the required secrets (DO token, slack token, S3 -credentials) and none of them have to be provided when creating a new PR. -However, the deployment workflow is not triggered automatically if you change -the secrets. In this case, you have to trigger the workflow manually. - -# Manual deployments - -To manually deploy the service (useful for testing and debugging), you first -need to set the following environment variables (you will be prompted later if -you don't set these variables): - -## Required environment variables - -```bash -# DigitalOcean personal access token: https://cloud.digitalocean.com/account/api/tokens -export TF_VAR_do_token= -# Slack access token: https://api.slack.com/apps -export TF_VAR_slack_token= -# S3 access keys used by the snapshot service. Can be generated here: https://cloud.digitalocean.com/account/api/spaces -export TF_VAR_AWS_ACCESS_KEY_ID= -export TF_VAR_AWS_SECRET_ACCESS_KEY= -# S3 access keys used by terraform, use the same values as above -export AWS_ACCESS_KEY_ID= -export AWS_SECRET_ACCESS_KEY= - -# Optional, only if you want install new relic agent -# New Relic License key, Can be generated here: https://one.eu.newrelic.com/admin-portal/api-keys/home -export TF_VAR_NEW_RELIC_API_KEY= -export TF_VAR_NEW_RELIC_ACCOUNT_ID= -``` - -Forest tokens can be found on 1password. - -you'll also need to link your public key with Digital Ocean. To do this, visit https://cloud.digitalocean.com/account/security. Additionally, set up your SSH key by following the commands provided below: - -```bash -eval $(ssh-agent) - -ssh-add -``` - -To ensure the production Snapshot service remains intact, modify certain variables in the `Main.tf` file: - -- Change `key = "sync_check.tfstate"` to `key = ".tfstate"`. -- Replace `name = "forest-sync-check"` with `name = ""`. -- Replace ` slack_channel = "#forest-notifications"` with `slack_channel = "#forest-dump"` - -Remember to replace ``, ``, and `` with appropriate values. - -To prepare terraform for other commands: -```bash -$ terraform init -``` - -To inspect a new deployment plan (it'll tell you which servers will be removed, -added, etc.): -```bash -$ terraform plan -``` -For Mac users, if you encounter the `Error: External Program Execution Failed`, you'll need to adjust the `prep_sources.sh` file located in the `../modules/sync_check` directory. Make the following changes: - -- Replace `--archive` with `-Rp`. -- Install `gnu-tar` using the command `brew install gnu-tar`. 
Afterward, switch `tar cf ../sources.tar` to `gtar cf ../sources.tar` - -To deploy the service: -```bash -$ terraform apply -``` - -To shutdown the service: -```bash -$ terraform destroy -``` diff --git a/terraform/sync_check/main.tf b/terraform/sync_check/main.tf deleted file mode 100644 index 9fb85069d..000000000 --- a/terraform/sync_check/main.tf +++ /dev/null @@ -1,49 +0,0 @@ -terraform { - required_version = "~> 1.3" - - backend "s3" { - # Note: This is the bucket for the internal terraform state. This bucket is - # completely independent from the bucket that contains snapshots. - bucket = "forest-iac" - # This key uniquely identifies the service. To create a new service (instead - # of modifying this one), use a new key. Unfortunately, variables may not be - # used here. - key = "sync_check.tfstate" - - # This value is completely unused by DO but _must_ be a known AWS region. - region = "us-west-1" - # The S3 region is determined by the endpoint. fra1 = Frankfurt. - # This region does not have to be shared by the droplet. - endpoints = { - s3 = "https://fra1.digitaloceanspaces.com" - } - - # Credentially can be validated through the Security Token Service (STS). - # Unfortunately, DigitalOcean does not support STS so we have to skip the - # validation. - skip_credentials_validation = "true" - skip_requesting_account_id = "true" - skip_s3_checksum = "true" - } -} - -module "sync_check" { - # Import the sync_check module - source = "../modules/sync_check" - - # Configure service: - name = "forest-sync-check" # droplet name - size = "s-4vcpu-16gb-amd" # droplet size - slack_channel = "#forest-notifications" # slack channel for notifications - - # Variable passthrough: - slack_token = var.slack_token - digitalocean_token = var.do_token - NEW_RELIC_API_KEY = var.NEW_RELIC_API_KEY - NEW_RELIC_ACCOUNT_ID = var.NEW_RELIC_ACCOUNT_ID -} - -# This ip address may be used in the future by monitoring software -output "ip" { - value = [module.sync_check.ip] -} diff --git a/terraform/sync_check/variable.tf b/terraform/sync_check/variable.tf deleted file mode 100644 index 2e5b27545..000000000 --- a/terraform/sync_check/variable.tf +++ /dev/null @@ -1,23 +0,0 @@ -variable "do_token" { - description = "Token for authentication." 
-  type = string
-  sensitive = true
-}
-
-variable "slack_token" {
-  description = "slack access token"
-  type = string
-  sensitive = true
-}
-
-variable "NEW_RELIC_API_KEY" {
-  description = "New Relic API KEY"
-  type = string
-  sensitive = true
-}
-
-variable "NEW_RELIC_ACCOUNT_ID" {
-  description = "The New Relic Account ID"
-  type = string
-  sensitive = true
-}

From c3d0d958bd8b1d7b27fb6d16beb4122d647b0cea Mon Sep 17 00:00:00 2001
From: Hubert Bugaj
Date: Tue, 16 Jan 2024 10:21:34 +0100
Subject: [PATCH 28/46] mv common to scripts

---
 tf-managed/{common => scripts}/ruby_common/docker_utils.rb | 0
 tf-managed/{common => scripts}/ruby_common/slack_client.rb | 0
 tf-managed/{common => scripts}/ruby_common/utils.rb        | 0
 3 files changed, 0 insertions(+), 0 deletions(-)
 rename tf-managed/{common => scripts}/ruby_common/docker_utils.rb (100%)
 rename tf-managed/{common => scripts}/ruby_common/slack_client.rb (100%)
 rename tf-managed/{common => scripts}/ruby_common/utils.rb (100%)

diff --git a/tf-managed/common/ruby_common/docker_utils.rb b/tf-managed/scripts/ruby_common/docker_utils.rb
similarity index 100%
rename from tf-managed/common/ruby_common/docker_utils.rb
rename to tf-managed/scripts/ruby_common/docker_utils.rb
diff --git a/tf-managed/common/ruby_common/slack_client.rb b/tf-managed/scripts/ruby_common/slack_client.rb
similarity index 100%
rename from tf-managed/common/ruby_common/slack_client.rb
rename to tf-managed/scripts/ruby_common/slack_client.rb
diff --git a/tf-managed/common/ruby_common/utils.rb b/tf-managed/scripts/ruby_common/utils.rb
similarity index 100%
rename from tf-managed/common/ruby_common/utils.rb
rename to tf-managed/scripts/ruby_common/utils.rb

From 4e3903230b626cee33ac723c6f55d24d7b947875 Mon Sep 17 00:00:00 2001
From: Hubert Bugaj
Date: Tue, 16 Jan 2024 10:21:53 +0100
Subject: [PATCH 29/46] more docs

---
 tf-managed/README.md           | 41 +++++++++++++++++++++++++++++++---
 tf-managed/live/terragrunt.hcl | 10 +++++----
 2 files changed, 44 insertions(+), 7 deletions(-)

diff --git a/tf-managed/README.md b/tf-managed/README.md
index 316ed2ceb..52c03e3d6 100644
--- a/tf-managed/README.md
+++ b/tf-managed/README.md
@@ -5,7 +5,42 @@ This directory contains services and assets managed via Terraform/Terragrunt.
 # Structure
 
 ```
-├── common <- common code, shared between all modules (TODO maybe move it to modules?)
-├── live <- configurations for different environments.
-└── modules <- service and resources definitions
+├── scripts # common code, shared between all modules
+├── live # actual environment definitions, managed by Terragrunt
+└── modules # Terraform modules, from which the environment is built
 ```
+
+# Requirements
+
+### Software
+
+* [terraform](https://developer.hashicorp.com/terraform/install)
+* [terragrunt](https://terragrunt.gruntwork.io/docs/getting-started/install/)
+
+For recommended versions, please refer to the workflow files. # TODO: link the specific workflow file
+
+### Secrets
+
+Refer to [environment README](./live/README.md) or module-specific README.
+
+# Adding new services
+
+1. Create a Terraform module and put it in [modules](./modules). A suggested structure of such module is:
+   * `main.tf` - the core resources around the service,
+   * `variable.tf` - inputs to the module, e.g., enable Slack notifications,
+   * `outputs.tf` - outputs of the module, e.g., created VPS IP,
+   * `provider.tf` - `terraform` and `provider` blocks to keep the versioning in one place,
+   * `service/` - directory with the actual service implementation. This will be changed in the future.
+   * Other files and directories based on needs, e.g., `monitoring` to generate monitoring resources.
+
+2. Create a Terragrunt service in your own development environment and assert that it works correctly:
+   * inside [live](./live), execute `make environment`. Go to that directory.
+   * inside `applications/`, create your `fancy-app` directory and a `terragrunt.hcl` file. There you will invoke the created module with input variables (see the example after this list).
+   * run `terragrunt plan` to assert that all variables are set correctly, and that the plan output matches your expectations.
+   * run `terragrunt apply` to apply the plan.
+   * perform necessary assertions (the resources are created, the server is responding to requests, monitoring outputs make sense)
+   * if all is good, tear down the service with `terragrunt destroy`.
+
+3. Copy the tested service to [dev](./live/environments/dev/applications) and to [prod](./live/environments/prod/applications). Remove your environment directory.
+
+4. Make a PR!
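+
+A minimal `terragrunt.hcl` for a hypothetical `fancy-app` service could look like the sketch below. The module name and the inputs are illustrative; they have to match what your module actually declares in its `variable.tf`:
+
+```hcl
+# Automatically find the root terragrunt.hcl and inherit its configuration
+include "root" {
+  path = find_in_parent_folders()
+}
+
+# Load the actual Terraform module
+terraform {
+  source = format("%s/../modules/fancy-app", get_parent_terragrunt_dir())
+}
+
+# Illustrative inputs; they must match the variables declared by the module
+inputs = {
+  name = "fancy-app"
+  size = "s-4vcpu-16gb-amd"
+}
+```
+
+Once this file is in place, running `terragrunt plan` and `terragrunt apply` from its directory operates on the new service.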
diff --git a/tf-managed/live/terragrunt.hcl b/tf-managed/live/terragrunt.hcl
index 45c337123..56ac84314 100644
--- a/tf-managed/live/terragrunt.hcl
+++ b/tf-managed/live/terragrunt.hcl
@@ -9,7 +9,7 @@ locals {
   env = local.parsed.env
 }
 
-# Remote state, separate for each environment
+# Remote state, separate for each environment and service.
 remote_state {
   backend = "s3"
   generate = {
@@ -17,7 +17,9 @@ remote_state {
     if_exists = "overwrite_terragrunt"
   }
   config = {
-    // if the environment is dev, use the dev bucket, otherwise use the prod bucket
+    // Provide some basic separation between development and production environments.
+    // Ideally, we'd use separate accounts for each environment, but that's not
+    // feasible at the moment.
     bucket = (local.env == "prod" ?
       "forest-iac-bucket-prod" :
       "forest-iac-bucket-dev"
@@ -41,9 +43,9 @@ remote_state {
 # Common inputs for all the services.
 inputs = {
   # The common resources dir contains common code that we want to share across all services.
-  # This is a legacy from the previous version of the infrastructure, and this will be removed
+  # This is a legacy from the previous version of the infrastructure, and will be removed
   # in the future.
-  common_resources_dir = format("%s/../common", get_parent_terragrunt_dir())
+  common_resources_dir = format("%s/../scripts", get_parent_terragrunt_dir())
   slack_channel = (local.env == "prod" ? "#forest-notifications" : "#forest-dump")
   environment = local.env
 }

From 0d5cedfe375c28c0466ca2e87cc1f0c57a765ec7 Mon Sep 17 00:00:00 2001
From: Hubert Bugaj
Date: Tue, 16 Jan 2024 11:34:59 +0100
Subject: [PATCH 30/46] add helpers/docs

---
 tf-managed/README.md       | 12 ++++++------
 tf-managed/live/.gitignore |  1 +
 tf-managed/live/Makefile   | 40 +++++++++++++++++++++++++++++++++++---
 tf-managed/live/README.md  |  4 ++--
 4 files changed, 46 insertions(+), 11 deletions(-)

diff --git a/tf-managed/README.md b/tf-managed/README.md
index 52c03e3d6..8c5b54028 100644
--- a/tf-managed/README.md
+++ b/tf-managed/README.md
@@ -26,19 +26,19 @@ Refer to [environment README](./live/README.md) or module-specific README.
 # Adding new services
 
 1. Create a Terraform module and put it in [modules](./modules). A suggested structure of such module is:
-   * `main.tf` - the core resources around the service,
-   * `variable.tf` - inputs to the module, e.g., enable Slack notifications,
+   * `main.tf` - the core resources around the service.
+   * `variable.tf` - inputs to the module, e.g., enable Slack notifications.
-   * `outputs.tf` - outputs of the module, e.g., created VPS IP,
-   * `provider.tf` - `terraform` and `provider` blocks to keep the versioning in one place,
+   * `outputs.tf` - outputs of the module, e.g., created VPS IP.
+   * `provider.tf` - `terraform` and `provider` blocks to keep the versioning in one place.
    * `service/` - directory with the actual service implementation. This will be changed in the future.
    * Other files and directories based on needs, e.g., `monitoring` to generate monitoring resources.
 
 2. Create a Terragrunt service in your own development environment and assert that it works correctly:
-   * inside [live](./live), execute `make environment`. Go to that directory.
+   * inside [live](./live), execute `make create-environment`. Go to that directory.
    * inside `applications/`, create your `fancy-app` directory and a `terragrunt.hcl` file. There you will invoke the created module with input variables (see the example after this list).
    * run `terragrunt plan` to assert that all variables are set correctly, and that the plan output matches your expectations.
    * run `terragrunt apply` to apply the plan.
-   * perform necessary assertions (the resources are created, the server is responding to requests, monitoring outputs make sense)
+   * perform necessary assertions (the resources are created, the server is responding to requests, monitoring outputs make sense).
    * if all is good, tear down the service with `terragrunt destroy`.
 
 3. Copy the tested service to [dev](./live/environments/dev/applications) and to [prod](./live/environments/prod/applications). Remove your environment directory.
 
 4. Make a PR!
diff --git a/tf-managed/live/.gitignore b/tf-managed/live/.gitignore
index 4f2d3e9f5..8a151cc06 100644
--- a/tf-managed/live/.gitignore
+++ b/tf-managed/live/.gitignore
@@ -16,3 +16,4 @@ vendor
 
 # Personal development environments
 environments/dev-*
+.dev_environment
diff --git a/tf-managed/live/Makefile b/tf-managed/live/Makefile
index bd34788f0..0a2a200a8 100644
--- a/tf-managed/live/Makefile
+++ b/tf-managed/live/Makefile
@@ -1,7 +1,41 @@
-# Creates a new environment for development from the base one.
-environment:
+# General-purpose Makefile for managing the environments.
+
+DEV_ENVIRONMENT_FILE=.dev_environment
+
+# Creates a new environment. The name of the environment is generated
+# randomly and stored in the .dev_environment file.
+${DEV_ENVIRONMENT_FILE}:
 	$(eval export ENVIRONMENT=dev-$(shell cat /dev/urandom | tr -dc 'a-z0-9' | fold -w 8 | head -n 1))
 	@cp -r environments/dev environments/$(ENVIRONMENT)
 	@echo "Environment: $(ENVIRONMENT). Happy hacking!"
+	@echo $(ENVIRONMENT) > ${DEV_ENVIRONMENT_FILE}
+
+# Creates a new environment.
+create-environment: ${DEV_ENVIRONMENT_FILE}
+
+# Deploys the entire development environment. Feel free to remove the
+# services you don't need.
+deploy-dev: ${DEV_ENVIRONMENT_FILE}
+	$(eval export ENVIRONMENT=$(shell cat ${DEV_ENVIRONMENT_FILE}))
+	@test -n "$(ENVIRONMENT)" || (echo "ENVIRONMENT is not set" && exit 1)
+	@echo "Deploying $(ENVIRONMENT)..."
+	@cd environments/$(ENVIRONMENT) && terragrunt run-all apply
+	@echo "Environment $(ENVIRONMENT) deployed."
+
+# Tears down the entire development environment and removes the
+# environment directory.
+destroy-dev: ${DEV_ENVIRONMENT_FILE} + $(eval export ENVIRONMENT=$(shell cat ${DEV_ENVIRONMENT_FILE})) + @test -n "$(ENVIRONMENT)" || (echo "ENVIRONMENT is not set" && exit 1) + @echo "Destroying $(ENVIRONMENT)..." + @cd environments/$(ENVIRONMENT) && terragrunt run-all destroy + @rm -rf environments/$(ENVIRONMENT) + @rm ${DEV_ENVIRONMENT_FILE} + @echo "Environment $(ENVIRONMENT) destroyed." + +# Deploys the entire production environment. +deploy-prod: + @echo "Deploying to production..." + cd environments/prod && terragrunt run-all apply -.PHONY: environment +.PHONY: create-environment deploy-dev destroy-dev deploy-prod diff --git a/tf-managed/live/README.md b/tf-managed/live/README.md index 7762502c8..b4fd11948 100644 --- a/tf-managed/live/README.md +++ b/tf-managed/live/README.md @@ -7,7 +7,7 @@ The Terragrunt configurations manage the actual environments and, in principle, As a developer, you should create your own environment, separated from the others. In this directory, execute `make environment` and one will be created for you. Do not work on the `dev` environment directly as others may be working on it as well. ``` -❯ make environment +❯ make create-environment Environment: dev-7zryf85r. Happy hacking! ``` @@ -23,7 +23,7 @@ This should show you the resources to be changed/created/destroyed. After ensuring the changes work correctly, merge the changes from your development environment to the base one and, possibly, `prod`. -Remember to cleanup your environment. Use `terragrunt destroy`. +Remember to cleanup your environment. Use `terragrunt destroy` or use `make destroy-dev`. Refer to the [Makefile](./Makefile) for details. # Conventions From d41764b400e54b23dc79f4cd440ce7a55062e577 Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Tue, 16 Jan 2024 11:49:05 +0100 Subject: [PATCH 31/46] match prod --- .../snapshot-monitoring/terragrunt.hcl | 10 +++++++++ .../snapshot-service/terragrunt.hcl | 22 +++++++++++++++++++ .../applications/sync-check/terragrunt.hcl | 9 +++----- 3 files changed, 35 insertions(+), 6 deletions(-) create mode 100644 tf-managed/live/environments/prod/applications/snapshot-monitoring/terragrunt.hcl create mode 100644 tf-managed/live/environments/prod/applications/snapshot-service/terragrunt.hcl diff --git a/tf-managed/live/environments/prod/applications/snapshot-monitoring/terragrunt.hcl b/tf-managed/live/environments/prod/applications/snapshot-monitoring/terragrunt.hcl new file mode 100644 index 000000000..a92cb9f18 --- /dev/null +++ b/tf-managed/live/environments/prod/applications/snapshot-monitoring/terragrunt.hcl @@ -0,0 +1,10 @@ +# Automatically find the root terragrunt.hcl and inherit its +# configuration +include "root" { + path = find_in_parent_folders() +} + +# Load the actual Terraform module +terraform { + source = format("%s/../modules/snapshot-monitoring", get_parent_terragrunt_dir()) +} diff --git a/tf-managed/live/environments/prod/applications/snapshot-service/terragrunt.hcl b/tf-managed/live/environments/prod/applications/snapshot-service/terragrunt.hcl new file mode 100644 index 000000000..9438f7aa2 --- /dev/null +++ b/tf-managed/live/environments/prod/applications/snapshot-service/terragrunt.hcl @@ -0,0 +1,22 @@ +# Automatically find the root terragrunt.hcl and inherit its +# configuration +include "root" { + path = find_in_parent_folders() +} + +# Load the actual Terraform module +terraform { + source = format("%s/../modules/daily-snapshot", get_parent_terragrunt_dir()) +} + +inputs = { + name = "forest-snapshot" + size = "s-4vcpu-16gb-amd" + 
r2_endpoint = "https://2238a825c5aca59233eab1f221f7aefb.r2.cloudflarestorage.com/" + forest_tag = "v0.16.4" + snapshot_bucket = "forest-archive-dev" + + monitoring = { + enable = true, + } +} diff --git a/tf-managed/live/environments/prod/applications/sync-check/terragrunt.hcl b/tf-managed/live/environments/prod/applications/sync-check/terragrunt.hcl index 937c00e05..d8961db58 100644 --- a/tf-managed/live/environments/prod/applications/sync-check/terragrunt.hcl +++ b/tf-managed/live/environments/prod/applications/sync-check/terragrunt.hcl @@ -6,13 +6,10 @@ include "root" { # Load the actual Terraform module terraform { - source = format("%s/../modules/daily-snapshot", get_parent_terragrunt_dir()) + source = format("%s/../modules/sync-check", get_parent_terragrunt_dir()) } inputs = { - name = "forest-snapshot" - size = "s-4vcpu-16gb-amd" - r2_endpoint = "https://2238a825c5aca59233eab1f221f7aefb.r2.cloudflarestorage.com/" - forest_tag = "v0.16.4" - snapshot_bucket = "forest-archive" + name = "sync-check" + size = "s-4vcpu-16gb-amd" } From d96af8ff6179f4e933d62023979a34e883e32810 Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Tue, 16 Jan 2024 12:47:30 +0100 Subject: [PATCH 32/46] hclfmt --- .../prod/applications/snapshot-service/terragrunt.hcl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tf-managed/live/environments/prod/applications/snapshot-service/terragrunt.hcl b/tf-managed/live/environments/prod/applications/snapshot-service/terragrunt.hcl index 9438f7aa2..25c028946 100644 --- a/tf-managed/live/environments/prod/applications/snapshot-service/terragrunt.hcl +++ b/tf-managed/live/environments/prod/applications/snapshot-service/terragrunt.hcl @@ -10,10 +10,10 @@ terraform { } inputs = { - name = "forest-snapshot" - size = "s-4vcpu-16gb-amd" - r2_endpoint = "https://2238a825c5aca59233eab1f221f7aefb.r2.cloudflarestorage.com/" - forest_tag = "v0.16.4" + name = "forest-snapshot" + size = "s-4vcpu-16gb-amd" + r2_endpoint = "https://2238a825c5aca59233eab1f221f7aefb.r2.cloudflarestorage.com/" + forest_tag = "v0.16.4" snapshot_bucket = "forest-archive-dev" monitoring = { From f30cac5d96929f864f4ad7da2520d9158359eed7 Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Tue, 16 Jan 2024 13:02:22 +0100 Subject: [PATCH 33/46] tinker workflow --- .github/workflows/deploy-daily-snapshot.yml | 27 +-- .tflint.hcl | 6 +- composite-action/terragrunt/action.yml | 183 ++++++++++++++++++ .../snapshot-service/terragrunt.hcl | 8 +- tf-managed/modules/daily-snapshot/main.tf | 2 +- tf-managed/modules/sync-check/main.tf | 2 +- 6 files changed, 206 insertions(+), 22 deletions(-) create mode 100644 composite-action/terragrunt/action.yml diff --git a/.github/workflows/deploy-daily-snapshot.yml b/.github/workflows/deploy-daily-snapshot.yml index 224be8b77..332961d56 100644 --- a/.github/workflows/deploy-daily-snapshot.yml +++ b/.github/workflows/deploy-daily-snapshot.yml @@ -1,26 +1,28 @@ -name: Snapshot Service +name: Deploy Snapshot Service concurrency: ci-${{ github.ref }} on: pull_request: branches: - main - paths: - - 'terraform/daily_snapshot/**' - - 'terraform/modules/daily_snapshot/**' + # paths: + # - 'tf-managed/modules/daily_snapshot/**' + # - 'tf-managed/scripts/**' + # - 'tf-managed/live/environments/prod/applications/snapshot-service' # This needs to be declared explicitly so that the job is actually # run when moved out of draft. 
types: [opened, synchronize, reopened, ready_for_review] push: branches: - main - paths: - - 'terraform/daily_snapshot/**' - - 'terraform/modules/daily_snapshot/**' + # paths: + # - 'tf-managed/modules/daily_snapshot/**' + # - 'tf-managed/scripts/**' + # - 'tf-managed/live/environments/prod/applications/snapshot-service' workflow_dispatch: jobs: - deploy-daily-snapshot-calibnet: + deploy-daily-snapshot: name: Deploy runs-on: ubuntu-latest permissions: write-all @@ -28,18 +30,17 @@ jobs: - name: Checkout the code uses: actions/checkout@v4 - # Using Custom Composite action in ./composite-action/terraform folder - - name: Composite Action for Deploying Terraform Resources - uses: ./composite-action/terraform + # Using Custom Composite action in ./composite-action/terragrunt folder + - name: Composite Action for Deploying Terragrunt Resources + uses: ./composite-action/terragrunt with: do_token: ${{ secrets.DO_TOKEN }} aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} r2_access_key: ${{ secrets.R2_ACCESS_KEY }} r2_secret_key: ${{ secrets.R2_SECRET_KEY }} slack_token: ${{ secrets.SLACK_TOKEN }} - working_directory: terraform/daily_snapshot/prod + working_directory: tf-managed/live/environments/prod/applications/snapshot-service environment: Snapshot Service new_relic_account_id: ${{ secrets.NEW_RELIC_ACCOUNT_ID }} new_relic_api_key: ${{ secrets.NEW_RELIC_API_KEY }} diff --git a/.tflint.hcl b/.tflint.hcl index f8979ce8f..18e65e468 100644 --- a/.tflint.hcl +++ b/.tflint.hcl @@ -1,5 +1,5 @@ plugin "terraform" { - enabled = true - version = "0.2.2" - source = "github.com/terraform-linters/tflint-ruleset-terraform" + enabled = true + version = "0.5.0" + source = "github.com/terraform-linters/tflint-ruleset-terraform" } diff --git a/composite-action/terragrunt/action.yml b/composite-action/terragrunt/action.yml new file mode 100644 index 000000000..569d3312f --- /dev/null +++ b/composite-action/terragrunt/action.yml @@ -0,0 +1,183 @@ +name: Custom Composite action to deploy terragrunt resources + +description: | + This action deploys the Forest infrastructure with Terragrunt + +inputs: + environment: + description: 'The terraform plan for the the environment infrastructure to be deployed' + required: true + do_token: + description: 'The DigitalOcean access token to use for deploying the infrastructure' + required: true + aws_access_key_id: + description: 'S3 access keys id used by terraform and service like sync check, Deploy Snapshot Service etc' + required: true + aws_secret_access_key: + description: 'S3 secret access keys used by terraform and service like sync check, Deploy Snapshot Service etc' + required: true + working_directory: + description: 'The working Directory' + required: true + slack_token: + description: 'The slack token secret used to connect the Infrastructure to Slack' + new_relic_api_key: + description: 'The New Relic API KEY' + nr_license_key: + description: 'The New Relic Access Token' + new_relic_account_id: + description: 'The New Relic Platform Region' + r2_access_key: + description: 'CloudFlare R2 access key id' + r2_secret_key: + description: 'CloudFlare R2 private access key' + +runs: + using: "composite" + steps: + # Workaround for https://github.com/orgs/community/discussions/51280 + - name: Set TF/TG versions + shell: bash + run: | + echo "tf_version=1.6.6" >> $GITHUB_ENV + echo "tg_version=0.53.2" >> $GITHUB_ENV + + - name: Check terragrunt HCL + uses: 
gruntwork-io/terragrunt-action@v2 + with: + tf_version: ${{ env.tf_version }} + tg_version: ${{ env.tg_version }} + tg_dir: ${{ inputs.working_directory }} + tg_command: 'hclfmt --terragrunt-check --terragrunt-diff' + + - name: Validate + uses: gruntwork-io/terragrunt-action@v2 + with: + tf_version: ${{ env.tf_version }} + tg_version: ${{ env.tg_version }} + tg_dir: ${{ inputs.working_directory }} + tg_command: 'validate' + env: + AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }} + AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }} + + - name: Plan + if: github.event_name == 'pull_request' + uses: gruntwork-io/terragrunt-action@v2 + id: plan + with: + tf_version: ${{ env.tf_version }} + tg_version: ${{ env.tg_version }} + tg_dir: ${{ inputs.working_directory }} + tg_command: 'plan -no-color' + tg_comment: 1 + continue-on-error: true + env: + AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }} + AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }} + TF_VAR_digitalocean_token: ${{ inputs.do_token }} + TF_VAR_AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }} + TF_VAR_AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }} + TF_VAR_R2_ACCESS_KEY: ${{ inputs.r2_access_key }} + TF_VAR_R2_SECRET_KEY: ${{ inputs.r2_secret_key }} + TF_VAR_slack_token: ${{ inputs.slack_token }} + TF_VAR_new_relic_api_key: ${{ inputs.new_relic_api_key }} + TF_VAR_new_relic_account_id: ${{ inputs.new_relic_account_id }} + + - name: Plan output cleanup + if: always() + shell: bash + run: | + TG_OUT=$(echo '${{ steps.plan.outputs.tg_action_output }}' | sed 's|%0A|\n|g ; s|%3C|<|g') + echo "TG_PLAN_OUTPUT<<EOF" >> $GITHUB_ENV + echo "${TG_OUT:0:65300}" >> $GITHUB_ENV + echo "EOF" >> $GITHUB_ENV + + - name: Find Comment + if: github.event.pull_request.draft == true && + github.event_name == 'pull_request' + uses: peter-evans/find-comment@v2 + id: fc + with: + issue-number: ${{ github.event.pull_request.number }} + comment-author: 'github-actions[bot]' + body-regex: "^### Forest: ${{ inputs.environment }} Infrastructure Plan" + + + - name: Create or Update Comment + if: github.event.pull_request.draft == true && + github.event_name == 'pull_request' && + !contains(env.TG_PLAN_OUTPUT, 'No changes. Your infrastructure matches the configuration.') + uses: peter-evans/create-or-update-comment@v2 + with: + comment-id: ${{ steps.fc.outputs.comment-id }} + issue-number: ${{ github.event.pull_request.number }} + body: | + ### Forest: ${{ inputs.environment }} Infrastructure Plan: ${{ steps.plan.outcome }} + +
<details><summary>Show Plan</summary> + + ``` + ${{ env.TG_PLAN_OUTPUT }} + ``` + + </details> +
+ edit-mode: replace + + - name: Delete Comment + uses: detomarco/delete-comments@v1.0.4 + if: github.event.pull_request.draft == true && + github.event_name == 'pull_request' && + contains(env.TG_PLAN_OUTPUT, 'No changes. Your infrastructure matches the configuration.') + with: + comment-id: ${{ steps.fc.outputs.comment-id }} + + - name: Terraform Plan Status + shell: bash + if: steps.plan.tg_action_exit_code != 0 + run: exit 1 + # + # - name: Terraform Apply + # if: github.ref == 'refs/heads/main' && github.event_name == 'push' + # run: | + # if grep -q 'No changes.' tfplan; then + # echo "No changes detected." + # else + # echo "Changes detected. Redeploying everything..." + # terraform destroy -auto-approve -input=false + # terraform apply -auto-approve -input=false + # fi + # shell: bash + # working-directory: ${{ inputs.working_directory }} + # env: + # TF_VAR_do_token: ${{ inputs.do_token }} + # TF_VAR_AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }} + # TF_VAR_AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }} + # AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }} + # AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }} + # TF_VAR_slack_token: ${{ inputs.slack_token }} + # TF_VAR_R2_ACCESS_KEY: ${{ inputs.r2_access_key }} + # TF_VAR_R2_SECRET_KEY: ${{ inputs.r2_secret_key }} + # TF_VAR_NEW_RELIC_API_KEY: ${{ inputs.NEW_RELIC_API_KEY }} + # TF_VAR_NR_LICENSE_KEY: ${{ inputs.NR_LICENSE_KEY }} + # TF_VAR_NEW_RELIC_ACCOUNT_ID: ${{ inputs.new_relic_account_id }} + # + # - name: Terraform Force Apply + # if: github.ref == 'refs/heads/main' && github.event_name == 'workflow_dispatch' + # shell: bash + # working-directory: ${{ inputs.working_directory }} + # env: + # TF_VAR_do_token: ${{ inputs.do_token }} + # TF_VAR_AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }} + # TF_VAR_AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }} + # AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }} + # AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }} + # TF_VAR_R2_ACCESS_KEY: ${{ inputs.r2_access_key }} + # TF_VAR_R2_SECRET_KEY: ${{ inputs.r2_secret_key }} + # TF_VAR_slack_token: ${{ inputs.slack_token }} + # TF_VAR_NEW_RELIC_API_KEY: ${{ inputs.new_relic_api_key }} + # TF_VAR_NR_LICENSE_KEY: ${{ inputs.nr_license_key }} + # TF_VAR_NEW_RELIC_ACCOUNT_ID: ${{ inputs.new_relic_account_id }} + # run: | + # terraform destroy -auto-approve -input=false + # terraform apply -auto-approve -input=false diff --git a/tf-managed/live/environments/dev/applications/snapshot-service/terragrunt.hcl b/tf-managed/live/environments/dev/applications/snapshot-service/terragrunt.hcl index 61f168e46..75c7d00a3 100644 --- a/tf-managed/live/environments/dev/applications/snapshot-service/terragrunt.hcl +++ b/tf-managed/live/environments/dev/applications/snapshot-service/terragrunt.hcl @@ -10,10 +10,10 @@ terraform { } inputs = { - name = "forest-snapshot" - size = "s-4vcpu-16gb-amd" - r2_endpoint = "https://2238a825c5aca59233eab1f221f7aefb.r2.cloudflarestorage.com/" - forest_tag = "latest" + name = "forest-snapshot" + size = "s-4vcpu-16gb-amd" + r2_endpoint = "https://2238a825c5aca59233eab1f221f7aefb.r2.cloudflarestorage.com/" + forest_tag = "latest" snapshot_bucket = "forest-archive-dev" monitoring = { diff --git a/tf-managed/modules/daily-snapshot/main.tf b/tf-managed/modules/daily-snapshot/main.tf index 26b343043..fd2366ded 100644 --- a/tf-managed/modules/daily-snapshot/main.tf +++ b/tf-managed/modules/daily-snapshot/main.tf @@ -7,7 +7,7 @@ // Ugly hack because 'archive_file' cannot mix files and 
folders. data "external" "sources_tar" { - program = ["sh", "${path.module}/prep_sources.sh", path.module, var.common_resources_dir] + program = ["bash", "${path.module}/prep_sources.sh", path.module, var.common_resources_dir] } diff --git a/tf-managed/modules/sync-check/main.tf b/tf-managed/modules/sync-check/main.tf index 837cf4572..eb8595f3f 100644 --- a/tf-managed/modules/sync-check/main.tf +++ b/tf-managed/modules/sync-check/main.tf @@ -7,7 +7,7 @@ // Ugly hack because 'archive_file' cannot mix files and folders. data "external" "sources_tar" { - program = ["sh", "${path.module}/prep_sources.sh", path.module, var.common_resources_dir] + program = ["bash", "${path.module}/prep_sources.sh", path.module, var.common_resources_dir] } data "local_file" "sources" { From e81fcbaf858605f7e283ff3cdc45967bc2c3c255 Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Tue, 16 Jan 2024 15:53:31 +0100 Subject: [PATCH 34/46] rm root scripts --- .github/workflows/scripts-lint.yml | 2 +- scripts/Gemfile | 7 ---- scripts/Gemfile.lock | 42 ---------------------- scripts/install-new-relic.sh | 46 ------------------------ scripts/ruby_common/docker_utils.rb | 12 ------- scripts/ruby_common/slack_client.rb | 56 ----------------------------- scripts/ruby_common/utils.rb | 9 ----- 7 files changed, 1 insertion(+), 173 deletions(-) delete mode 100644 scripts/Gemfile delete mode 100644 scripts/Gemfile.lock delete mode 100755 scripts/install-new-relic.sh delete mode 100644 scripts/ruby_common/docker_utils.rb delete mode 100644 scripts/ruby_common/slack_client.rb delete mode 100644 scripts/ruby_common/utils.rb diff --git a/.github/workflows/scripts-lint.yml b/.github/workflows/scripts-lint.yml index fec9d298f..5fdd9fc4b 100644 --- a/.github/workflows/scripts-lint.yml +++ b/.github/workflows/scripts-lint.yml @@ -29,7 +29,7 @@ jobs: - name: Run rubocop run: | gem install rubocop --no-document - rubocop scripts/ # TODO: Apply rubocop to terraform/modules/ + rubocop tf-managed/scripts/ # TODO: Apply rubocop to terraform/modules/ run-js-linters: runs-on: ubuntu-latest steps: diff --git a/scripts/Gemfile b/scripts/Gemfile deleted file mode 100644 index c90bdd858..000000000 --- a/scripts/Gemfile +++ /dev/null @@ -1,7 +0,0 @@ -# frozen_string_literal: true - -source 'https://rubygems.org' - -gem 'docker-api', '>= 2.2.0' -gem 'slack-ruby-client', '>= 2.1.0' -gem 'sys-filesystem', '>= 1.4.3' diff --git a/scripts/Gemfile.lock b/scripts/Gemfile.lock deleted file mode 100644 index 7c8b69809..000000000 --- a/scripts/Gemfile.lock +++ /dev/null @@ -1,42 +0,0 @@ -GEM - remote: https://rubygems.org/ - specs: - did_you_mean (1.6.3) - docker-api (2.2.0) - excon (>= 0.47.0) - multi_json - excon (0.99.0) - faraday (2.7.4) - faraday-net_http (>= 2.0, < 3.1) - ruby2_keywords (>= 0.0.4) - faraday-mashify (0.1.1) - faraday (~> 2.0) - hashie - faraday-multipart (1.0.4) - multipart-post (~> 2) - faraday-net_http (3.0.2) - ffi (1.15.5) - gli (2.21.0) - hashie (5.0.0) - multi_json (1.15.0) - multipart-post (2.3.0) - ruby2_keywords (0.0.5) - slack-ruby-client (2.1.0) - faraday (>= 2.0) - faraday-mashify - faraday-multipart - gli - hashie - sys-filesystem (1.4.3) - ffi (~> 1.1) - -PLATFORMS - x86_64-linux - -DEPENDENCIES - docker-api (>= 2.2.0) - slack-ruby-client (>= 2.1.0) - sys-filesystem (>= 1.4.3) - -BUNDLED WITH - 2.3.4 diff --git a/scripts/install-new-relic.sh b/scripts/install-new-relic.sh deleted file mode 100755 index 2d2884156..000000000 --- a/scripts/install-new-relic.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash - -# This script 
offers an easy way to install the New Relic infrastructure agent for -# basic monitoring on Ubuntu systems, without needing administrative privileges. -# To get started, simply set your New Relic license key with the command export NR_LICENSE_KEY=your_license_key_here. - -set -euo pipefail - -# Setting DEBIAN_FRONTEND to ensure non-interactive operations for APT -export DEBIAN_FRONTEND=noninteractive - -# Add New Relic's apt repository -curl -fsSL https://download.newrelic.com/infrastructure_agent/gpg/newrelic-infra.gpg | sudo gpg --dearmor -o /etc/apt/trusted.gpg.d/newrelic-infra.gpg -echo "deb https://download.newrelic.com/infrastructure_agent/linux/apt focal main" | sudo tee -a /etc/apt/sources.list.d/newrelic-infra.list - -# Check if NR_LICENSE_KEY is set, if not ask for it -if [[ -z "${NR_LICENSE_KEY:-}" ]]; then - read -rp "Please enter your NR_LICENSE_KEY: " NR_LICENSE_KEY -fi - -# Update the package list -sudo apt-get update - -# The provided configurations are specific to New Relic. To gain a deeper understanding of these configuration details, you can visit: -# https://docs.newrelic.com/docs/infrastructure/install-infrastructure-agent/configuration/infrastructure-agent-configuration-settings/#offline-time-to-reset -cat >> /etc/newrelic-infra.yml < Date: Wed, 17 Jan 2024 14:11:29 +0100 Subject: [PATCH 35/46] tinker more --- composite-action/terragrunt/action.yml | 154 ++++++++++++------------- 1 file changed, 76 insertions(+), 78 deletions(-) diff --git a/composite-action/terragrunt/action.yml b/composite-action/terragrunt/action.yml index 569d3312f..cf5eeb640 100644 --- a/composite-action/terragrunt/action.yml +++ b/composite-action/terragrunt/action.yml @@ -42,36 +42,40 @@ runs: echo "tf_version=1.6.6" >> $GITHUB_ENV echo "tg_version=0.53.2" >> $GITHUB_ENV - - name: Check terragrunt HCL - uses: gruntwork-io/terragrunt-action@v2 + - name: Setup Terraform + uses: hashicorp/setup-terraform@v2 with: - tf_version: ${{ env.tf_version }} - tg_version: ${{ env.tg_version }} - tg_dir: ${{ inputs.working_directory }} - tg_command: 'hclfmt --terragrunt-check --terragrunt-diff' + terraform_version: v${{ env.tf_version }} + terraform_wrapper: false + + - name: Setup Terragrunt + shell: bash + run: | + sudo wget -q -O /bin/terragrunt "https://github.com/gruntwork-io/terragrunt/releases/download/v${{ env.tg_version }}/terragrunt_linux_amd64" + sudo chmod +x /bin/terragrunt + terragrunt -v + + - name: Check terragrunt HCL + working-directory: ${{ inputs.working_directory }} + shell: bash + run: | + terragrunt hclfmt --terragrunt-check --terragrunt-diff - name: Validate - uses: gruntwork-io/terragrunt-action@v2 - with: - tf_version: ${{ env.tf_version }} - tg_version: ${{ env.tg_version }} - tg_dir: ${{ inputs.working_directory }} - tg_command: 'validate' + working-directory: ${{ inputs.working_directory }} + shell: bash + run: | + terragrunt validate env: AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }} AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }} - name: Plan if: github.event_name == 'pull_request' - uses: gruntwork-io/terragrunt-action@v2 id: plan - with: - tf_version: ${{ env.tf_version }} - tg_version: ${{ env.tg_version }} - tg_dir: ${{ inputs.working_directory }} - tg_command: 'plan -no-color' - tg_comment: 1 + working-directory: ${{ inputs.working_directory }} continue-on-error: true + shell: bash env: AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }} AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }} @@ -83,18 +87,11 @@ runs: TF_VAR_slack_token: ${{ 
inputs.slack_token }} TF_VAR_new_relic_api_key: ${{ inputs.new_relic_api_key }} TF_VAR_new_relic_account_id: ${{ inputs.new_relic_account_id }} - - - name: Plan output cleanup - if: always() - shell: bash run: | - TG_OUT=$(echo '${{ steps.plan.outputs.tg_action_output }}' | sed 's|%0A|\n|g ; s|%3C|<|g') - echo "TG_PLAN_OUTPUT<> $GITHUB_ENV - echo "${TG_OUT:0:65300}" >> $GITHUB_ENV - echo "EOF" >> $GITHUB_ENV + terragrunt plan -no-color --terragrunt-non-interactive -out ${{ github.workspace }}/tfplan - name: Find Comment - if: github.event.pull_request.draft == true && + if: github.event.pull_request.draft == false && github.event_name == 'pull_request' uses: peter-evans/find-comment@v2 id: fc @@ -107,7 +104,7 @@ runs: - name: Create or Update Comment if: github.event.pull_request.draft == true && github.event_name == 'pull_request' && - !contains(env.TG_PLAN_OUTPUT, 'No changes. Your infrastructure matches the configuration.') + !contains(steps.plan.outputs.stdout, 'No changes. Your infrastructure matches the configuration.') uses: peter-evans/create-or-update-comment@v2 with: comment-id: ${{ steps.fc.outputs.comment-id }} @@ -118,7 +115,7 @@ runs:
<details><summary>Show Plan</summary> ``` - ${{ env.TG_PLAN_OUTPUT }} + ${{ steps.plan.outputs.stdout }} ``` </details>
@@ -128,56 +125,57 @@ runs: uses: detomarco/delete-comments@v1.0.4 if: github.event.pull_request.draft == true && github.event_name == 'pull_request' && - contains(env.TG_PLAN_OUTPUT, 'No changes. Your infrastructure matches the configuration.') + contains(steps.plan.outputs.stdout, 'No changes. Your infrastructure matches the configuration.') with: comment-id: ${{ steps.fc.outputs.comment-id }} - - name: Terraform Plan Status + - name: Terragrunt Plan Status shell: bash - if: steps.plan.tg_action_exit_code != 0 + if: steps.plan.outcome == 'failure' run: exit 1 - # - # - name: Terraform Apply - # if: github.ref == 'refs/heads/main' && github.event_name == 'push' - # run: | - # if grep -q 'No changes.' tfplan; then - # echo "No changes detected." - # else - # echo "Changes detected. Redeploying everything..." - # terraform destroy -auto-approve -input=false - # terraform apply -auto-approve -input=false - # fi - # shell: bash - # working-directory: ${{ inputs.working_directory }} - # env: - # TF_VAR_do_token: ${{ inputs.do_token }} - # TF_VAR_AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }} - # TF_VAR_AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }} - # AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }} - # AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }} - # TF_VAR_slack_token: ${{ inputs.slack_token }} - # TF_VAR_R2_ACCESS_KEY: ${{ inputs.r2_access_key }} - # TF_VAR_R2_SECRET_KEY: ${{ inputs.r2_secret_key }} - # TF_VAR_NEW_RELIC_API_KEY: ${{ inputs.NEW_RELIC_API_KEY }} - # TF_VAR_NR_LICENSE_KEY: ${{ inputs.NR_LICENSE_KEY }} - # TF_VAR_NEW_RELIC_ACCOUNT_ID: ${{ inputs.new_relic_account_id }} - # - # - name: Terraform Force Apply - # if: github.ref == 'refs/heads/main' && github.event_name == 'workflow_dispatch' - # shell: bash - # working-directory: ${{ inputs.working_directory }} - # env: - # TF_VAR_do_token: ${{ inputs.do_token }} - # TF_VAR_AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }} - # TF_VAR_AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }} - # AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }} - # AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }} - # TF_VAR_R2_ACCESS_KEY: ${{ inputs.r2_access_key }} - # TF_VAR_R2_SECRET_KEY: ${{ inputs.r2_secret_key }} - # TF_VAR_slack_token: ${{ inputs.slack_token }} - # TF_VAR_NEW_RELIC_API_KEY: ${{ inputs.new_relic_api_key }} - # TF_VAR_NR_LICENSE_KEY: ${{ inputs.nr_license_key }} - # TF_VAR_NEW_RELIC_ACCOUNT_ID: ${{ inputs.new_relic_account_id }} - # run: | - # terraform destroy -auto-approve -input=false - # terraform apply -auto-approve -input=false + + - name: Terragrunt Apply + if: github.ref == 'refs/heads/main' && github.event_name == 'push' + run: | + if grep -q 'No changes.' tfplan; then + echo "No changes detected." + else + echo "Changes detected. Redeploying everything..." 
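+          # Full redeploy: destroy the existing resources first, then re-create everything from scratch.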
+ terragrunt destroy -auto-approve --terragrunt-non-interactive + terragrunt apply -auto-approve --terragrunt-non-interactive + fi + shell: bash + working-directory: ${{ inputs.working_directory }} + env: + TF_VAR_do_token: ${{ inputs.do_token }} + TF_VAR_AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }} + TF_VAR_AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }} + AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }} + AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }} + TF_VAR_slack_token: ${{ inputs.slack_token }} + TF_VAR_R2_ACCESS_KEY: ${{ inputs.r2_access_key }} + TF_VAR_R2_SECRET_KEY: ${{ inputs.r2_secret_key }} + TF_VAR_NEW_RELIC_API_KEY: ${{ inputs.NEW_RELIC_API_KEY }} + TF_VAR_NR_LICENSE_KEY: ${{ inputs.NR_LICENSE_KEY }} + TF_VAR_NEW_RELIC_ACCOUNT_ID: ${{ inputs.new_relic_account_id }} + + - name: Terragrunt Force Apply + if: github.ref == 'refs/heads/main' && github.event_name == 'workflow_dispatch' + shell: bash + working-directory: ${{ inputs.working_directory }} + env: + TF_VAR_do_token: ${{ inputs.do_token }} + TF_VAR_AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }} + TF_VAR_AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }} + AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }} + AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }} + TF_VAR_R2_ACCESS_KEY: ${{ inputs.r2_access_key }} + TF_VAR_R2_SECRET_KEY: ${{ inputs.r2_secret_key }} + TF_VAR_slack_token: ${{ inputs.slack_token }} + TF_VAR_NEW_RELIC_API_KEY: ${{ inputs.new_relic_api_key }} + TF_VAR_NR_LICENSE_KEY: ${{ inputs.nr_license_key }} + TF_VAR_NEW_RELIC_ACCOUNT_ID: ${{ inputs.new_relic_account_id }} + run: | + terragrunt destroy -auto-approve --terragrunt-non-interactive + terragrunt apply -auto-approve --terragrunt-non-interactive + From 4a4a459b443c587e81c8153b5aa3154cc078e75f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 17 Jan 2024 14:10:38 +0000 Subject: [PATCH 36/46] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- composite-action/terragrunt/action.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/composite-action/terragrunt/action.yml b/composite-action/terragrunt/action.yml index cf5eeb640..ab1bbd9eb 100644 --- a/composite-action/terragrunt/action.yml +++ b/composite-action/terragrunt/action.yml @@ -178,4 +178,3 @@ runs: run: | terragrunt destroy -auto-approve --terragrunt-non-interactive terragrunt apply -auto-approve --terragrunt-non-interactive - From b42ff22419a08df42b29c4dd3bb7d54934add3fd Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Wed, 17 Jan 2024 15:16:06 +0100 Subject: [PATCH 37/46] tinker more --- .github/workflows/deploy-daily-snapshot.yml | 5 +- .github/workflows/deploy-sync-check.yml | 33 +++++++----- composite-action/terragrunt/action.yml | 58 +++++++++++---------- 3 files changed, 51 insertions(+), 45 deletions(-) diff --git a/.github/workflows/deploy-daily-snapshot.yml b/.github/workflows/deploy-daily-snapshot.yml index 332961d56..96dafb0d1 100644 --- a/.github/workflows/deploy-daily-snapshot.yml +++ b/.github/workflows/deploy-daily-snapshot.yml @@ -1,4 +1,4 @@ -name: Deploy Snapshot Service +name: Snapshot Service concurrency: ci-${{ github.ref }} on: @@ -23,7 +23,6 @@ on: jobs: deploy-daily-snapshot: - name: Deploy runs-on: ubuntu-latest permissions: write-all steps: @@ -41,6 +40,6 @@ jobs: r2_secret_key: ${{ secrets.R2_SECRET_KEY }} slack_token: ${{ secrets.SLACK_TOKEN }} working_directory: 
tf-managed/live/environments/prod/applications/snapshot-service - environment: Snapshot Service + service_name: Snapshot Service new_relic_account_id: ${{ secrets.NEW_RELIC_ACCOUNT_ID }} new_relic_api_key: ${{ secrets.NEW_RELIC_API_KEY }} diff --git a/.github/workflows/deploy-sync-check.yml b/.github/workflows/deploy-sync-check.yml index 9f1b1723c..c80051dcd 100644 --- a/.github/workflows/deploy-sync-check.yml +++ b/.github/workflows/deploy-sync-check.yml @@ -5,36 +5,41 @@ on: pull_request: branches: - main - paths: - - 'terraform/sync_check/**' - - 'terraform/modules/sync_check/**' + # paths: + # - 'tf-managed/modules/sync-check/**' + # - 'tf-managed/scripts/**' + # - 'tf-managed/live/environments/prod/applications/snapshot-service' + # This needs to be declared explicitly so that the job is actually + # run when moved out of draft. + types: [opened, synchronize, reopened, ready_for_review] push: branches: - main - paths: - - 'terraform/sync_check/**' - - 'terraform/modules/sync_check/**' + # paths: + # - 'tf-managed/modules/sync-check/**' + # - 'tf-managed/scripts/**' + # - 'tf-managed/live/environments/prod/applications/snapshot-service' workflow_dispatch: jobs: - sync-check: - name: Deploy + deploy-sync-check: runs-on: ubuntu-latest permissions: write-all steps: - name: Checkout the code uses: actions/checkout@v4 - # Using Custom Composite action in ./composite-action/terraform folder - - name: Composite Action for Deploying Terraform Resources - uses: ./composite-action/terraform + # Using Custom Composite action in ./composite-action/terragrunt folder + - name: Composite Action for Deploying Terragrunt Resources + uses: ./composite-action/terragrunt with: do_token: ${{ secrets.DO_TOKEN }} aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} + r2_access_key: ${{ secrets.R2_ACCESS_KEY }} + r2_secret_key: ${{ secrets.R2_SECRET_KEY }} slack_token: ${{ secrets.SLACK_TOKEN }} - working_directory: terraform/sync_check - environment: Sync Check Service + working_directory: tf-managed/live/environments/prod/applications/sync-check + service_name: Snapshot Service new_relic_account_id: ${{ secrets.NEW_RELIC_ACCOUNT_ID }} new_relic_api_key: ${{ secrets.NEW_RELIC_API_KEY }} diff --git a/composite-action/terragrunt/action.yml b/composite-action/terragrunt/action.yml index ab1bbd9eb..64aad5c4a 100644 --- a/composite-action/terragrunt/action.yml +++ b/composite-action/terragrunt/action.yml @@ -4,8 +4,8 @@ description: | This action deploys the Forest infrastructure with Terragrunt inputs: - environment: - description: 'The terraform plan for the the environment infrastructure to be deployed' + service_name: + description: 'Human-readable name of the service' required: true do_token: description: 'The DigitalOcean access token to use for deploying the infrastructure' @@ -46,6 +46,7 @@ runs: uses: hashicorp/setup-terraform@v2 with: terraform_version: v${{ env.tf_version }} + # This is required for Terragrunt to parse Terraform outputs. 
terraform_wrapper: false - name: Setup Terragrunt @@ -56,14 +57,14 @@ runs: terragrunt -v - name: Check terragrunt HCL - working-directory: ${{ inputs.working_directory }} shell: bash + working-directory: ${{ inputs.working_directory }} run: | terragrunt hclfmt --terragrunt-check --terragrunt-diff - name: Validate - working-directory: ${{ inputs.working_directory }} shell: bash + working-directory: ${{ inputs.working_directory }} run: | terragrunt validate env: @@ -73,9 +74,9 @@ runs: - name: Plan if: github.event_name == 'pull_request' id: plan + shell: bash working-directory: ${{ inputs.working_directory }} continue-on-error: true - shell: bash env: AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }} AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }} @@ -88,7 +89,10 @@ runs: TF_VAR_new_relic_api_key: ${{ inputs.new_relic_api_key }} TF_VAR_new_relic_account_id: ${{ inputs.new_relic_account_id }} run: | - terragrunt plan -no-color --terragrunt-non-interactive -out ${{ github.workspace }}/tfplan + terragrunt plan -no-color --terragrunt-non-interactive -out ${{ github.workspace }}/tfplan | tee output + echo 'plan<> $GITHUB_OUTPUT + cat output >> $GITHUB_OUTPUT + echo 'EOF' >> $GITHUB_OUTPUT - name: Find Comment if: github.event.pull_request.draft == false && @@ -98,24 +102,24 @@ runs: with: issue-number: ${{ github.event.pull_request.number }} comment-author: 'github-actions[bot]' - body-regex: "^### Forest: ${{ inputs.environment }} Infrastructure Plan" + body-regex: "^### Forest: ${{ inputs.service_name }} Infrastructure Plan" - name: Create or Update Comment if: github.event.pull_request.draft == true && github.event_name == 'pull_request' && - !contains(steps.plan.outputs.stdout, 'No changes. Your infrastructure matches the configuration.') + !contains(steps.plan.outputs.plan, 'No changes. Your infrastructure matches the configuration.') uses: peter-evans/create-or-update-comment@v2 with: comment-id: ${{ steps.fc.outputs.comment-id }} issue-number: ${{ github.event.pull_request.number }} body: | - ### Forest: ${{ inputs.environment }} Infrastructure Plan: ${{ steps.plan.outcome }} + ### Forest: ${{ inputs.service_name }} Infrastructure Plan: ${{ steps.plan.outcome }}
<details><summary>Show Plan</summary> ``` - ${{ steps.plan.outputs.stdout }} + ${{ steps.plan.outputs.plan }} ``` </details>
@@ -125,56 +129,54 @@ runs: uses: detomarco/delete-comments@v1.0.4 if: github.event.pull_request.draft == true && github.event_name == 'pull_request' && - contains(steps.plan.outputs.stdout, 'No changes. Your infrastructure matches the configuration.') + contains(steps.plan.outputs.plan, 'No changes. Your infrastructure matches the configuration.') with: comment-id: ${{ steps.fc.outputs.comment-id }} - name: Terragrunt Plan Status - shell: bash if: steps.plan.outcome == 'failure' + shell: bash run: exit 1 - name: Terragrunt Apply if: github.ref == 'refs/heads/main' && github.event_name == 'push' + shell: bash run: | - if grep -q 'No changes.' tfplan; then + if grep -q 'No changes.' ${{ github.workspace }}/tfplan; then echo "No changes detected." else echo "Changes detected. Redeploying everything..." terragrunt destroy -auto-approve --terragrunt-non-interactive terragrunt apply -auto-approve --terragrunt-non-interactive fi - shell: bash working-directory: ${{ inputs.working_directory }} env: - TF_VAR_do_token: ${{ inputs.do_token }} - TF_VAR_AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }} - TF_VAR_AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }} AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }} AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }} - TF_VAR_slack_token: ${{ inputs.slack_token }} + TF_VAR_digitalocean_token: ${{ inputs.do_token }} + TF_VAR_AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }} + TF_VAR_AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }} TF_VAR_R2_ACCESS_KEY: ${{ inputs.r2_access_key }} TF_VAR_R2_SECRET_KEY: ${{ inputs.r2_secret_key }} - TF_VAR_NEW_RELIC_API_KEY: ${{ inputs.NEW_RELIC_API_KEY }} - TF_VAR_NR_LICENSE_KEY: ${{ inputs.NR_LICENSE_KEY }} - TF_VAR_NEW_RELIC_ACCOUNT_ID: ${{ inputs.new_relic_account_id }} + TF_VAR_slack_token: ${{ inputs.slack_token }} + TF_VAR_new_relic_api_key: ${{ inputs.new_relic_api_key }} + TF_VAR_new_relic_account_id: ${{ inputs.new_relic_account_id }} - name: Terragrunt Force Apply if: github.ref == 'refs/heads/main' && github.event_name == 'workflow_dispatch' - shell: bash working-directory: ${{ inputs.working_directory }} env: - TF_VAR_do_token: ${{ inputs.do_token }} - TF_VAR_AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }} - TF_VAR_AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }} AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }} AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }} + TF_VAR_digitalocean_token: ${{ inputs.do_token }} + TF_VAR_AWS_ACCESS_KEY_ID: ${{ inputs.aws_access_key_id }} + TF_VAR_AWS_SECRET_ACCESS_KEY: ${{ inputs.aws_secret_access_key }} TF_VAR_R2_ACCESS_KEY: ${{ inputs.r2_access_key }} TF_VAR_R2_SECRET_KEY: ${{ inputs.r2_secret_key }} TF_VAR_slack_token: ${{ inputs.slack_token }} - TF_VAR_NEW_RELIC_API_KEY: ${{ inputs.new_relic_api_key }} - TF_VAR_NR_LICENSE_KEY: ${{ inputs.nr_license_key }} - TF_VAR_NEW_RELIC_ACCOUNT_ID: ${{ inputs.new_relic_account_id }} + TF_VAR_new_relic_api_key: ${{ inputs.new_relic_api_key }} + TF_VAR_new_relic_account_id: ${{ inputs.new_relic_account_id }} + shell: bash run: | terragrunt destroy -auto-approve --terragrunt-non-interactive terragrunt apply -auto-approve --terragrunt-non-interactive From e77ea4443fe43df5bc9be49c5814cfe064385d92 Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Wed, 17 Jan 2024 18:06:51 +0100 Subject: [PATCH 38/46] bump --- .github/workflows/deploy-sync-check.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deploy-sync-check.yml b/.github/workflows/deploy-sync-check.yml 
index c80051dcd..922ef4930 100644 --- a/.github/workflows/deploy-sync-check.yml +++ b/.github/workflows/deploy-sync-check.yml @@ -40,6 +40,6 @@ jobs: r2_secret_key: ${{ secrets.R2_SECRET_KEY }} slack_token: ${{ secrets.SLACK_TOKEN }} working_directory: tf-managed/live/environments/prod/applications/sync-check - service_name: Snapshot Service + service_name: Sync Check Service new_relic_account_id: ${{ secrets.NEW_RELIC_ACCOUNT_ID }} new_relic_api_key: ${{ secrets.NEW_RELIC_API_KEY }} From f3a0dd43375f9b4e8036468784b3dca23ed07eab Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Wed, 17 Jan 2024 18:25:39 +0100 Subject: [PATCH 39/46] tinker --- composite-action/terragrunt/action.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/composite-action/terragrunt/action.yml b/composite-action/terragrunt/action.yml index 64aad5c4a..f72ce4ff7 100644 --- a/composite-action/terragrunt/action.yml +++ b/composite-action/terragrunt/action.yml @@ -90,12 +90,12 @@ runs: TF_VAR_new_relic_account_id: ${{ inputs.new_relic_account_id }} run: | terragrunt plan -no-color --terragrunt-non-interactive -out ${{ github.workspace }}/tfplan | tee output - echo 'plan<<EOF' >> $GITHUB_OUTPUT + echo 'stdout<<EOF' >> $GITHUB_OUTPUT cat output >> $GITHUB_OUTPUT echo 'EOF' >> $GITHUB_OUTPUT - name: Find Comment - if: github.event.pull_request.draft == false && + if: github.event.pull_request.draft == true && github.event_name == 'pull_request' uses: peter-evans/find-comment@v2 id: fc @@ -108,7 +108,7 @@ - name: Create or Update Comment if: github.event.pull_request.draft == true && github.event_name == 'pull_request' && - !contains(steps.plan.outputs.plan, 'No changes. Your infrastructure matches the configuration.') + !contains(steps.plan.outputs.stdout, 'No changes. Your infrastructure matches the configuration.') uses: peter-evans/create-or-update-comment@v2 with: comment-id: ${{ steps.fc.outputs.comment-id }} @@ -119,7 +119,7 @@ 
<details><summary>Show Plan</summary> ``` - ${{ steps.plan.outputs.plan }} + ${{ steps.plan.outputs.stdout }} ``` </details>
@@ -129,7 +129,7 @@ runs: uses: detomarco/delete-comments@v1.0.4 if: github.event.pull_request.draft == true && github.event_name == 'pull_request' && - contains(steps.plan.outputs.plan, 'No changes. Your infrastructure matches the configuration.') + contains(steps.plan.outputs.stdout, 'No changes. Your infrastructure matches the configuration.') with: comment-id: ${{ steps.fc.outputs.comment-id }} From 420a4c724522e1e9eefbd4f2fe9c2dcc33f17b67 Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Thu, 18 Jan 2024 10:44:53 +0100 Subject: [PATCH 40/46] bump From 171f09464841d71a33e92389200f80a80b3be826 Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Thu, 18 Jan 2024 10:50:05 +0100 Subject: [PATCH 41/46] bump From d9631247a438615611dc02102939644925979ecb Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Thu, 18 Jan 2024 18:09:35 +0100 Subject: [PATCH 42/46] cleanup log-based alerts --- tf-managed/modules/daily-snapshot/main.tf | 1 - .../daily-snapshot/monitoring/event_rules.tf | 28 ----- .../modules/daily-snapshot/monitoring/main.tf | 117 ++++-------------- .../daily-snapshot/monitoring/variable.tf | 5 - 4 files changed, 25 insertions(+), 126 deletions(-) delete mode 100644 tf-managed/modules/daily-snapshot/monitoring/event_rules.tf diff --git a/tf-managed/modules/daily-snapshot/main.tf b/tf-managed/modules/daily-snapshot/main.tf index fd2366ded..c931fb43b 100644 --- a/tf-managed/modules/daily-snapshot/main.tf +++ b/tf-managed/modules/daily-snapshot/main.tf @@ -113,5 +113,4 @@ module "monitoring" { slack_enable = var.monitoring.slack_enable slack_destination_id = var.monitoring.slack_destination_id slack_channel_id = var.monitoring.slack_channel_id - new_relic_account_id = var.new_relic_account_id } diff --git a/tf-managed/modules/daily-snapshot/monitoring/event_rules.tf b/tf-managed/modules/daily-snapshot/monitoring/event_rules.tf deleted file mode 100644 index a7381dada..000000000 --- a/tf-managed/modules/daily-snapshot/monitoring/event_rules.tf +++ /dev/null @@ -1,28 +0,0 @@ -# This file constains NR event rules used to generate metrics from logs, given that -# the service is not generating metrics by itself. 
-resource "newrelic_events_to_metrics_rule" "generate_snapshot_attempt_metrics" { - account_id = var.new_relic_account_id - for_each = toset(["mainnet", "calibnet"]) - - name = format("%s %s snapshot generation attempts", var.service_name, each.key) - description = "Snapshot generation attempts" - nrql = "From Log select uniqueCount(message) as '${var.service_name}.${each.key}.snapshot_generation_run' WHERE `hostname` = '${var.service_name}' AND filePath ='/root/logs/${each.key}_log.txt' AND message LIKE '%running snapshot export%'" -} - -resource "newrelic_events_to_metrics_rule" "generate_snapshot_success_metrics" { - account_id = var.new_relic_account_id - for_each = toset(["mainnet", "calibnet"]) - - name = format("%s %s snapshot generation success", var.service_name, each.key) - description = "Success snapshot generations" - nrql = "From Log select uniqueCount(message) as '${var.service_name}.${each.key}.snapshot_generation_ok' WHERE `hostname` = '${var.service_name}' AND filePath ='/root/logs/${each.key}_log.txt' AND message LIKE '%Snapshot uploaded for%'" -} - -resource "newrelic_events_to_metrics_rule" "generate_snapshot_fail_metrics" { - account_id = var.new_relic_account_id - for_each = toset(["mainnet", "calibnet"]) - - name = format("%s %s snapshot generation failure", var.service_name, each.key) - description = "Failed snapshot generations" - nrql = "From Log select uniqueCount(message) as '${var.service_name}.${each.key}.snapshot_generation_fail' WHERE `hostname` = '${var.service_name}' AND filePath ='/root/logs/${each.key}_log.txt' AND message LIKE '%Snapshot upload failed for%'" -} diff --git a/tf-managed/modules/daily-snapshot/monitoring/main.tf b/tf-managed/modules/daily-snapshot/monitoring/main.tf index 88ff815c6..71197e3ac 100644 --- a/tf-managed/modules/daily-snapshot/monitoring/main.tf +++ b/tf-managed/modules/daily-snapshot/monitoring/main.tf @@ -6,33 +6,31 @@ locals { enable_email = var.alert_email != "" } -# resource "newrelic_nrql_alert_condition" "disk_space" { -# policy_id = newrelic_alert_policy.alert.id -# type = "static" -# name = "High Disk Utilization" -# description = "Alert when disk space usage is high on an the service host" -# enabled = true -# -# nrql { -# query = "SELECT latest(diskUsedPercent) FROM StorageSample where entityName = '${var.service_name}'" -# } -# -# critical { -# operator = "above" -# # threshold = 85.0 -# threshold = 20.0 -# threshold_duration = 300 -# threshold_occurrences = "ALL" -# } -# -# warning { -# operator = "above" -# # threshold = 70.0 -# threshold = 10.0 -# threshold_duration = 300 -# threshold_occurrences = "ALL" -# } -# } +resource "newrelic_nrql_alert_condition" "disk_space" { + policy_id = newrelic_alert_policy.alert.id + type = "static" + name = "High Disk Utilization" + description = "Alert when disk space usage is high on an the service host" + enabled = true + + nrql { + query = "SELECT latest(diskUsedPercent) FROM StorageSample where entityName = '${var.service_name}'" + } + + critical { + operator = "above" + threshold = 85.0 + threshold_duration = 300 + threshold_occurrences = "ALL" + } + + warning { + operator = "above" + threshold = 70.0 + threshold_duration = 300 + threshold_occurrences = "ALL" + } +} resource "newrelic_notification_destination" "email" { count = local.enable_email ? 
1 : 0 @@ -120,68 +118,3 @@ resource "newrelic_workflow" "alerting-workflow-slack" { channel_id = newrelic_notification_channel.slack-channel[0].id } } - -# At least 1 snapshot is generated in 5 hours interval -resource "newrelic_nrql_alert_condition" "snapshot_frequency_condition" { - for_each = toset(["mainnet", "calibnet"]) - policy_id = newrelic_alert_policy.alert.id - type = "static" - name = format("Low snapshot generation frequency - %s", each.key) - description = "Alert when snapshots are not generated within requried time interval" - enabled = true - - # evaluation_delay = 7200 # 2 hours, it may take some time to generate a snapshot - # aggregation_window = 14400 # 4 hours, it may take some time to generate a snapshot - aggregation_window = 360 # 4 hours, it may take some time to generate a snapshot - - - nrql { - query = format("FROM Metric SELECT count(`${var.service_name}.${each.key}.snapshot_generation_ok`)") - } - - warning { - operator = "below" - threshold = 1 - # threshold_duration = 14400 - threshold_duration = 360 - threshold_occurrences = "ALL" - } - - critical { - operator = "below" - threshold = 1 - # threshold_duration = 28800 - threshold_duration = 720 - threshold_occurrences = "ALL" - } -} - -# At least 1 successful snapshot out of 3 attempts - -#resource "newrelic_nrql_alert_condition" "disk_space" { -# policy_id = newrelic_alert_policy.alert.id -# type = "static" -# name = "High Disk Utilization" -# description = "Alert when disk space usage is high on an the service host" -# enabled = true -# -# nrql { -# query = "SELECT latest(diskUsedPercent) FROM StorageSample where entityName = '${var.service_name}'" -# } -# -# critical { -# operator = "above" -# # threshold = 85.0 -# threshold = 20.0 -# threshold_duration = 300 -# threshold_occurrences = "ALL" -# } -# -# warning { -# operator = "above" -# # threshold = 70.0 -# threshold = 10.0 -# threshold_duration = 300 -# threshold_occurrences = "ALL" -# } -#} diff --git a/tf-managed/modules/daily-snapshot/monitoring/variable.tf b/tf-managed/modules/daily-snapshot/monitoring/variable.tf index 484be045e..76499aa60 100644 --- a/tf-managed/modules/daily-snapshot/monitoring/variable.tf +++ b/tf-managed/modules/daily-snapshot/monitoring/variable.tf @@ -24,8 +24,3 @@ variable "slack_channel_id" { description = "Slack channel id" type = string } - -variable "new_relic_account_id" { - description = "New Relic account id" - type = number -} From f93ef18f169d67ff2bcda02f4fbd86b6a5be9ba2 Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Thu, 18 Jan 2024 19:21:19 +0100 Subject: [PATCH 43/46] tinker --- .github/workflows/deploy-daily-snapshot.yml | 7 +++++-- .github/workflows/deploy-sync-check.yml | 5 +++-- composite-action/terragrunt/action.yml | 10 +++++++++- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/.github/workflows/deploy-daily-snapshot.yml b/.github/workflows/deploy-daily-snapshot.yml index 96dafb0d1..dbc08e038 100644 --- a/.github/workflows/deploy-daily-snapshot.yml +++ b/.github/workflows/deploy-daily-snapshot.yml @@ -1,5 +1,5 @@ name: Snapshot Service -concurrency: ci-${{ github.ref }} +concurrency: ci-${{ github.ref }}-snapshot-service on: pull_request: @@ -23,6 +23,8 @@ on: jobs: deploy-daily-snapshot: + env: + TF_VAR_monitoring: "{ \"enable\": true,\"slack_enable\":true,\"slack_destination_id\":\"${{ secrets.SLACK_DESTINATION_ID }}\",\"slack_channel_id\":\"${{ secrets.SLACK_CHANNEL_ID }}\"}" runs-on: ubuntu-latest permissions: write-all steps: @@ -39,7 +41,8 @@ jobs: r2_access_key: ${{ 
secrets.R2_ACCESS_KEY }} r2_secret_key: ${{ secrets.R2_SECRET_KEY }} slack_token: ${{ secrets.SLACK_TOKEN }} - working_directory: tf-managed/live/environments/prod/applications/snapshot-service + working_directory: tf-managed/live/environments/dev/applications/snapshot-service service_name: Snapshot Service new_relic_account_id: ${{ secrets.NEW_RELIC_ACCOUNT_ID }} new_relic_api_key: ${{ secrets.NEW_RELIC_API_KEY }} + ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} diff --git a/.github/workflows/deploy-sync-check.yml b/.github/workflows/deploy-sync-check.yml index 922ef4930..cdc49ab88 100644 --- a/.github/workflows/deploy-sync-check.yml +++ b/.github/workflows/deploy-sync-check.yml @@ -1,5 +1,5 @@ name: Sync Check Service -concurrency: ci-${{ github.ref }} +concurrency: ci-${{ github.ref }}-sync-check on: pull_request: @@ -39,7 +39,8 @@ jobs: r2_access_key: ${{ secrets.R2_ACCESS_KEY }} r2_secret_key: ${{ secrets.R2_SECRET_KEY }} slack_token: ${{ secrets.SLACK_TOKEN }} - working_directory: tf-managed/live/environments/prod/applications/sync-check + working_directory: tf-managed/live/environments/dev/applications/sync-check service_name: Sync Check Service new_relic_account_id: ${{ secrets.NEW_RELIC_ACCOUNT_ID }} new_relic_api_key: ${{ secrets.NEW_RELIC_API_KEY }} + ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} diff --git a/composite-action/terragrunt/action.yml b/composite-action/terragrunt/action.yml index f72ce4ff7..93828fc6c 100644 --- a/composite-action/terragrunt/action.yml +++ b/composite-action/terragrunt/action.yml @@ -19,6 +19,8 @@ inputs: working_directory: description: 'The working Directory' required: true + ssh_private_key: + description: 'The SSH private key to use for connecting to Droplets via SSH' slack_token: description: 'The slack token secret used to connect the Infrastructure to Slack' new_relic_api_key: @@ -138,8 +140,14 @@ runs: shell: bash run: exit 1 + - name: Configure ssh-agent + # if: github.ref == 'refs/heads/main' && ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' ) + uses: webfactory/ssh-agent@v0.8.0 + with: + ssh-private-key: ${{ inputs.ssh_private_key }} + - name: Terragrunt Apply - if: github.ref == 'refs/heads/main' && github.event_name == 'push' + # if: github.ref == 'refs/heads/main' && github.event_name == 'push' shell: bash run: | if grep -q 'No changes.' 
${{ github.workspace }}/tfplan; then From 4582f701dc3e33668815bdfa30d576e510fb0353 Mon Sep 17 00:00:00 2001 From: Hubert Bugaj Date: Fri, 19 Jan 2024 11:40:31 +0100 Subject: [PATCH 44/46] self-review --- .github/workflows/deploy-daily-snapshot.yml | 18 +++++++++--------- .github/workflows/deploy-sync-check.yml | 18 +++++++++--------- .github/workflows/scripts-lint.yml | 2 +- composite-action/terragrunt/action.yml | 6 +++--- tf-managed/README.md | 15 ++++++++------- tf-managed/live/README.md | 18 +++++++++--------- .../snapshot-service/terragrunt.hcl | 2 +- .../modules/daily-snapshot/monitoring/main.tf | 4 ++-- .../service/newrelic_fail2ban.sh | 10 +++++----- tf-managed/modules/snapshot-monitoring/main.tf | 1 - 10 files changed, 47 insertions(+), 47 deletions(-) diff --git a/.github/workflows/deploy-daily-snapshot.yml b/.github/workflows/deploy-daily-snapshot.yml index dbc08e038..1b686daa3 100644 --- a/.github/workflows/deploy-daily-snapshot.yml +++ b/.github/workflows/deploy-daily-snapshot.yml @@ -5,20 +5,20 @@ on: pull_request: branches: - main - # paths: - # - 'tf-managed/modules/daily_snapshot/**' - # - 'tf-managed/scripts/**' - # - 'tf-managed/live/environments/prod/applications/snapshot-service' + paths: + - 'tf-managed/modules/daily_snapshot/**' + - 'tf-managed/scripts/**' + - 'tf-managed/live/environments/prod/applications/snapshot-service' # This needs to be declared explicitly so that the job is actually # run when moved out of draft. types: [opened, synchronize, reopened, ready_for_review] push: branches: - main - # paths: - # - 'tf-managed/modules/daily_snapshot/**' - # - 'tf-managed/scripts/**' - # - 'tf-managed/live/environments/prod/applications/snapshot-service' + paths: + - 'tf-managed/modules/daily_snapshot/**' + - 'tf-managed/scripts/**' + - 'tf-managed/live/environments/prod/applications/snapshot-service' workflow_dispatch: jobs: @@ -41,7 +41,7 @@ jobs: r2_access_key: ${{ secrets.R2_ACCESS_KEY }} r2_secret_key: ${{ secrets.R2_SECRET_KEY }} slack_token: ${{ secrets.SLACK_TOKEN }} - working_directory: tf-managed/live/environments/dev/applications/snapshot-service + working_directory: tf-managed/live/environments/prod/applications/snapshot-service service_name: Snapshot Service new_relic_account_id: ${{ secrets.NEW_RELIC_ACCOUNT_ID }} new_relic_api_key: ${{ secrets.NEW_RELIC_API_KEY }} diff --git a/.github/workflows/deploy-sync-check.yml b/.github/workflows/deploy-sync-check.yml index cdc49ab88..4c1eda309 100644 --- a/.github/workflows/deploy-sync-check.yml +++ b/.github/workflows/deploy-sync-check.yml @@ -5,20 +5,20 @@ on: pull_request: branches: - main - # paths: - # - 'tf-managed/modules/sync-check/**' - # - 'tf-managed/scripts/**' - # - 'tf-managed/live/environments/prod/applications/snapshot-service' + paths: + - 'tf-managed/modules/sync-check/**' + - 'tf-managed/scripts/**' + - 'tf-managed/live/environments/prod/applications/sync-check' # This needs to be declared explicitly so that the job is actually # run when moved out of draft. 
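      # (The paths filters above are combined with these activity types; both must match for the workflow to run.)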
types: [opened, synchronize, reopened, ready_for_review] push: branches: - main - # paths: - # - 'tf-managed/modules/sync-check/**' - # - 'tf-managed/scripts/**' - # - 'tf-managed/live/environments/prod/applications/snapshot-service' + paths: + - 'tf-managed/modules/sync-check/**' + - 'tf-managed/scripts/**' + - 'tf-managed/live/environments/prod/applications/sync-check' workflow_dispatch: jobs: @@ -39,7 +39,7 @@ jobs: r2_access_key: ${{ secrets.R2_ACCESS_KEY }} r2_secret_key: ${{ secrets.R2_SECRET_KEY }} slack_token: ${{ secrets.SLACK_TOKEN }} - working_directory: tf-managed/live/environments/dev/applications/sync-check + working_directory: tf-managed/live/environments/prod/applications/sync-check service_name: Sync Check Service new_relic_account_id: ${{ secrets.NEW_RELIC_ACCOUNT_ID }} new_relic_api_key: ${{ secrets.NEW_RELIC_API_KEY }} diff --git a/.github/workflows/scripts-lint.yml b/.github/workflows/scripts-lint.yml index 5fdd9fc4b..3898e1902 100644 --- a/.github/workflows/scripts-lint.yml +++ b/.github/workflows/scripts-lint.yml @@ -29,7 +29,7 @@ jobs: - name: Run rubocop run: | gem install rubocop --no-document - rubocop tf-managed/scripts/ # TODO: Apply rubocop to terraform/modules/ + rubocop tf-managed/scripts/ # TODO: Apply rubocop to tf-managed/modules/ run-js-linters: runs-on: ubuntu-latest steps: diff --git a/composite-action/terragrunt/action.yml b/composite-action/terragrunt/action.yml index 93828fc6c..f352dec87 100644 --- a/composite-action/terragrunt/action.yml +++ b/composite-action/terragrunt/action.yml @@ -97,7 +97,7 @@ runs: echo 'EOF' >> $GITHUB_OUTPUT - name: Find Comment - if: github.event.pull_request.draft == true && + if: github.event.pull_request.draft == false && github.event_name == 'pull_request' uses: peter-evans/find-comment@v2 id: fc @@ -108,7 +108,7 @@ runs: - name: Create or Update Comment - if: github.event.pull_request.draft == true && + if: github.event.pull_request.draft == false && github.event_name == 'pull_request' && !contains(steps.plan.outputs.stdout, 'No changes. Your infrastructure matches the configuration.') uses: peter-evans/create-or-update-comment@v2 @@ -129,7 +129,7 @@ runs: - name: Delete Comment uses: detomarco/delete-comments@v1.0.4 - if: github.event.pull_request.draft == true && + if: github.event.pull_request.draft == false && github.event_name == 'pull_request' && contains(steps.plan.outputs.stdout, 'No changes. Your infrastructure matches the configuration.') with: diff --git a/tf-managed/README.md b/tf-managed/README.md index 8c5b54028..3c57cf427 100644 --- a/tf-managed/README.md +++ b/tf-managed/README.md @@ -15,9 +15,9 @@ This directory contains services and assets managed via Terraform/Terragrunt. ### Software * [terraform](https://developer.hashicorp.com/terraform/install), -* [terraform](https://terragrunt.gruntwork.io/docs/getting-started/install/) +* [terragrunt](https://terragrunt.gruntwork.io/docs/getting-started/install/) -For recommended versions, please refer to the workflow files # TODO: Put file +For recommended versions, please refer to the [workflow file](../composite-action/terragrunt/action.yml). ### Secrets @@ -25,20 +25,21 @@ Refer to [environment README](./live/README.md) or module-specific README. # Adding new services -1. Create a Terraform module and put it in [modules](./modules). A suggested structure of such module is: +1. Create a Terraform module in [modules](./modules). A suggested structure of such a module is: * `main.tf` - the core resources around the service. 
* `variable.tf` - inputs to the module, e.g., enable Slack notifications. * `outputs.tf` - outputs of the module, e.g., created VPS IP. * `provider.tf` - `terraform` and `provider` blocks to keep the versioning in one place. * `service/` - directory with the actual service implementation. * Other files and directories based on needs, e.g., `monitoring` to generate monitoring resources. Ensure that names in the module, when needed, contain the environment. This provides a basic level of separation. 2. Create a Terragrunt service in your own development environment and assert that it works correctly: * inside [live](./live), execute `make create-environment`. Go to that directory. * inside the `applications/`, create your `fancy-app` directory and a `terragrunt.hcl` file. There, you will invoke the created module with input variables. * run `terragrunt plan` to assert that all variables are set correctly and that the plan output matches your expectations. * run `terragrunt apply` to apply the plan. * perform necessary assertions (the resources are created, the server responds to requests, and monitoring outputs make sense). * if all is good, tear down the service with `terragrunt destroy`. 3. Copy the tested service to [dev](./live/environments/dev/applications) and to [prod](./live/environments/prod/applications). Remove your environment directory. diff --git a/tf-managed/live/README.md b/tf-managed/live/README.md index b4fd11948..0c059def1 100644 --- a/tf-managed/live/README.md +++ b/tf-managed/live/README.md @@ -4,7 +4,7 @@ All Terragrunt configurations live here. To edit Terraform files, go to `../modu The Terragrunt configurations manage the actual environments and, in principle, should reflect the current state of the given environment. # Development -As a developer, you should create your own environment, separated from the others. In this directory, execute `make environment` and one will be created for you. Do not work on the `dev` environment directly as others may be working on it as well. +As a developer, you should create your own environment, separated from the others. In this directory, execute `make create-environment`, which will create one for you. Do not work on the `dev` environment directly, as others may also be working on it. ``` ❯ make create-environment Environment: dev-7zryf85r. Happy hacking! ``` @@ -15,33 +15,33 @@ Inside the specific application in the environment, run: ``` ❯ terragrunt plan ``` +This command will show you the resources to be changed/created/destroyed. -This should show you the resources to be changed/created/destroyed. ``` ❯ terragrunt apply ``` After ensuring the changes work correctly, merge the changes from your development environment to the base one and, possibly, `prod`. -Remember to cleanup your environment. Use `terragrunt destroy` or use `make destroy-dev`. Refer to the [Makefile](./Makefile) for details. +Remember to clean up your environment. Use `terragrunt destroy` or `make destroy-dev`. Refer to the [Makefile](./Makefile) for details. 
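For reference, a complete development cycle looks roughly like this (the environment name and the application are illustrative):

```
❯ make create-environment        # prints the generated environment name, e.g. dev-7zryf85r
❯ cd environments/dev-7zryf85r/applications/sync-check
❯ terragrunt plan                # review the proposed changes
❯ terragrunt apply
❯ cd -                           # back to the live/ directory
❯ make destroy-dev               # destroys the environment and removes its directory
```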
 # Conventions

 ## Environments

-There is currently no notion of `staging` environment, though one may be introduced in the future.
+There is no notion of a `staging` environment, though one may be introduced in the future.

 ```
 .
 ├── dev         # Development environment template for custom environments.
 ├── dev-<user>  # Personal development environment
-└── prod        # Production environment. Should reflect reality.
+└── prod        # Production environment. It should reflect reality.
 ```

-The `prod` environment should be deployed only by GH worker and not manually.
+The `prod` environment should be deployed only via the GitHub workflow, not manually.

-Each environment contains its respective `applications/`. A `base-infrastructure` may be created in the future to denote resources shared between applications. Each application should contain a single `terragrunt.hcl` file which only sets its configuration and, optionally, defines dependencies. The application code itself should be defined in `../modules`.
+Each environment contains its respective `applications/`. A `base-infrastructure` may be created to denote resources shared between applications. Each application should include a single `terragrunt.hcl` file which only sets its configuration and, optionally, defines dependencies. The application code itself should be defined in `../modules`.

 ```
@@ -54,7 +54,7 @@ Each environment contains its respective `applications/`
 └── terragrunt.hcl
 ```

-The difference between a `prod` and a `dev` application should be minimal. This would include a different Slack notification channel (which is already handled by the root `terragrunt.hcl`) or using a larger instances for `prod` environment.
+The difference between a `prod` and a `dev` application should be minimal. This would include a different Slack notification channel (already handled by the root `terragrunt.hcl`) or using larger instances for the `prod` environment.

 ## Tags

@@ -64,7 +64,7 @@ Everywhere where it's applicable, the resources should include the following tag

 # Secrets

-There are several secrets that need to be defined and provided for the services to work. You can find them in the team's password manager. Each service defines their own set required variables, though all need access to DigitalOcean. See modules' documentation for more details.
+Several secrets need to be defined and provided for the services to work. You can find them in the team's password manager. Each service defines its own set of required variables, though all need access to DigitalOcean. See the modules' documentation for more details.
 ```
 #################################
 ```

diff --git a/tf-managed/live/environments/prod/applications/snapshot-service/terragrunt.hcl b/tf-managed/live/environments/prod/applications/snapshot-service/terragrunt.hcl
index 25c028946..aa384a3f4 100644
--- a/tf-managed/live/environments/prod/applications/snapshot-service/terragrunt.hcl
+++ b/tf-managed/live/environments/prod/applications/snapshot-service/terragrunt.hcl
@@ -14,7 +14,7 @@ inputs = {
   size = "s-4vcpu-16gb-amd"
   r2_endpoint = "https://2238a825c5aca59233eab1f221f7aefb.r2.cloudflarestorage.com/"
   forest_tag = "v0.16.4"
-  snapshot_bucket = "forest-archive-dev"
+  snapshot_bucket = "forest-archive"

   monitoring = {
     enable = true,
diff --git a/tf-managed/modules/daily-snapshot/monitoring/main.tf b/tf-managed/modules/daily-snapshot/monitoring/main.tf
index 71197e3ac..19079d5ca 100644
--- a/tf-managed/modules/daily-snapshot/monitoring/main.tf
+++ b/tf-managed/modules/daily-snapshot/monitoring/main.tf
@@ -19,14 +19,14 @@ resource "newrelic_nrql_alert_condition" "disk_space" {

   critical {
     operator              = "above"
-    threshold             = 85.0
+    threshold             = 95.0
     threshold_duration    = 300
     threshold_occurrences = "ALL"
   }

   warning {
     operator              = "above"
-    threshold             = 70.0
+    threshold             = 85.0
     threshold_duration    = 300
     threshold_occurrences = "ALL"
   }
diff --git a/tf-managed/modules/daily-snapshot/service/newrelic_fail2ban.sh b/tf-managed/modules/daily-snapshot/service/newrelic_fail2ban.sh
index 00d885dcc..0e62ed350 100644
--- a/tf-managed/modules/daily-snapshot/service/newrelic_fail2ban.sh
+++ b/tf-managed/modules/daily-snapshot/service/newrelic_fail2ban.sh
@@ -17,11 +17,11 @@ if [ -n "$NEW_RELIC_API_KEY" ] ; then
   # https://docs.newrelic.com/docs/infrastructure/install-infrastructure-agent/configuration/infrastructure-agent-configuration-settings/#offline-time-to-reset
   cat >> /etc/newrelic-infra.yml <<EOF

From: Hubert Bugaj
Date: Fri, 19 Jan 2024 11:53:08 +0100
Subject: [PATCH 45/46] self-review 2

---
 composite-action/terragrunt/action.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/composite-action/terragrunt/action.yml b/composite-action/terragrunt/action.yml
index f352dec87..d0ee8beb3 100644
--- a/composite-action/terragrunt/action.yml
+++ b/composite-action/terragrunt/action.yml
@@ -141,13 +141,13 @@ runs:
       run: exit 1

     - name: Configure ssh-agent
-      # if: github.ref == 'refs/heads/main' && ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' )
+      if: github.ref == 'refs/heads/main' && ( github.event_name == 'push' || github.event_name == 'workflow_dispatch' )
       uses: webfactory/ssh-agent@v0.8.0
       with:
         ssh-private-key: ${{ inputs.ssh_private_key }}

     - name: Terragrunt Apply
-      # if: github.ref == 'refs/heads/main' && github.event_name == 'push'
+      if: github.ref == 'refs/heads/main' && github.event_name == 'push'
       shell: bash
       run: |
         if grep -q 'No changes.' ${{ github.workspace }}/tfplan; then
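
For context, the guard this patch re-enables boils down to something like the sketch below. The step body above is truncated, so the apply invocation here is illustrative, not the action's actual code; the file being grepped is the plan *output* saved to the workspace by an earlier step:

```bash
#!/bin/bash
# Sketch: skip the apply when the captured plan output reported no drift.
if grep -q 'No changes.' "$GITHUB_WORKSPACE/tfplan"; then
  echo "Plan reported no changes; skipping terragrunt apply."
else
  # Illustrative apply call; runs unattended on the main-branch push path.
  terragrunt apply -auto-approve --terragrunt-non-interactive
fi
```
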
From 249a74b9ec68928862ecc56eeae196eca7f7ac53 Mon Sep 17 00:00:00 2001
From: Hubert Bugaj
Date: Fri, 19 Jan 2024 18:15:37 +0100
Subject: [PATCH 46/46] fix wildcards in workflows

---
 .github/workflows/deploy-daily-snapshot.yml | 2 +-
 .github/workflows/deploy-sync-check.yml     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/deploy-daily-snapshot.yml b/.github/workflows/deploy-daily-snapshot.yml
index 1b686daa3..c79e4fa51 100644
--- a/.github/workflows/deploy-daily-snapshot.yml
+++ b/.github/workflows/deploy-daily-snapshot.yml
@@ -8,7 +8,7 @@ on:
     paths:
       - 'tf-managed/modules/daily-snapshot/**'
       - 'tf-managed/scripts/**'
-      - 'tf-managed/live/environments/prod/applications/snapshot-service'
+      - 'tf-managed/live/environments/prod/applications/snapshot-service/**'
   # This needs to be declared explicitly so that the job is actually
   # run when moved out of draft.
   types: [opened, synchronize, reopened, ready_for_review]
diff --git a/.github/workflows/deploy-sync-check.yml b/.github/workflows/deploy-sync-check.yml
index 4c1eda309..3c4f86aa2 100644
--- a/.github/workflows/deploy-sync-check.yml
+++ b/.github/workflows/deploy-sync-check.yml
@@ -8,7 +8,7 @@ on:
     paths:
       - 'tf-managed/modules/sync-check/**'
       - 'tf-managed/scripts/**'
-      - 'tf-managed/live/environments/prod/applications/sync-check'
+      - 'tf-managed/live/environments/prod/applications/sync-check/**'
   # This needs to be declared explicitly so that the job is actually
   # run when moved out of draft.
   types: [opened, synchronize, reopened, ready_for_review]
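
For context on the fix above: a GitHub Actions `paths` filter without a wildcard matches only a file at exactly that path, so changes to files *inside* the application directory would never trigger the workflow. The trailing `/**` makes the trigger behave as intended. A hypothetical minimal trigger illustrating the semantics:

```yaml
# Illustrative excerpt only; the real workflows above also filter on
# pull_request events and module/script paths.
on:
  push:
    branches:
      - main
    paths:
      # Matches any file under the application directory, not just the
      # directory path itself.
      - 'tf-managed/live/environments/prod/applications/sync-check/**'
```
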