cleanups, tinkering
LesnyRumcajs committed Jan 11, 2024
1 parent 3b3a395 commit 8a9bb7d
Showing 12 changed files with 173 additions and 105 deletions.
@@ -10,9 +10,9 @@ terraform {
}

inputs = {
name = "forest-snapshot"
size = "s-4vcpu-16gb-amd"
r2_endpoint = "https://2238a825c5aca59233eab1f221f7aefb.r2.cloudflarestorage.com/"
forest_tag = "v0.16.4"
name = "forest-snapshot"
size = "s-4vcpu-16gb-amd"
r2_endpoint = "https://2238a825c5aca59233eab1f221f7aefb.r2.cloudflarestorage.com/"
forest_tag = "v0.16.4"
snapshot_bucket = "forest-archive"
}
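The new snapshot_bucket input implies a matching variable declaration on the module side. A minimal sketch of what that could look like, assuming the module consumes it as a plain string (the description and default here are assumptions, not taken from the repository):

variable "snapshot_bucket" {
  description = "Bucket that receives the generated snapshots"
  type        = string
  default     = "forest-archive"
}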
32 changes: 16 additions & 16 deletions tf-managed/live/terragrunt.hcl
@@ -10,28 +10,28 @@ locals {
remote_state {
backend = "s3"
generate = {
path = "backend.tf"
path = "backend.tf"
if_exists = "overwrite_terragrunt"
}
config = {
// if the environment is dev, use the dev bucket, otherwise use the prod bucket
bucket = (local.env == "prod"
? "hubert-bucket-prod"
: "hubert-bucket-dev"
)
key = "${path_relative_to_include()}/terraform.tfstate"
region = "eu-west-1"
endpoint = "https://fra1.digitaloceanspaces.com"
skip_bucket_versioning = true
skip_bucket_ssencryption = true
skip_bucket_root_access = true
? "hubert-bucket-prod"
: "hubert-bucket-dev"
)
key = "${path_relative_to_include()}/terraform.tfstate"
region = "eu-west-1"
endpoint = "https://fra1.digitaloceanspaces.com"
skip_bucket_versioning = true
skip_bucket_ssencryption = true
skip_bucket_root_access = true
skip_bucket_public_access_blocking = true
skip_bucket_enforced_tls = true
skip_credentials_validation = true
skip_metadata_api_check = true
skip_requesting_account_id = true
skip_s3_checksum = true
skip_region_validation = true
skip_bucket_enforced_tls = true
skip_credentials_validation = true
skip_metadata_api_check = true
skip_requesting_account_id = true
skip_s3_checksum = true
skip_region_validation = true
}
}
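The local.env referenced in the bucket selection above is defined in the collapsed part of the locals block. A minimal sketch of how it could be derived, assuming it comes from an ENV environment variable with a dev fallback (the variable name and default are assumptions):

locals {
  env = get_env("ENV", "dev")
}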

12 changes: 6 additions & 6 deletions tf-managed/modules/daily-snapshot/main.tf
@@ -106,12 +106,12 @@ resource "digitalocean_project_resources" "connect_forest_project" {
}

module "monitoring" {
count = var.monitoring.enable ? 1 : 0
source = "./monitoring"
service_name = local.service_name
alert_email = var.monitoring.alert_email
slack_enable = var.monitoring.slack_enable
count = var.monitoring.enable ? 1 : 0
source = "./monitoring"
service_name = local.service_name
alert_email = var.monitoring.alert_email
slack_enable = var.monitoring.slack_enable
slack_destination_id = var.monitoring.slack_destination_id
slack_channel_id = var.monitoring.slack_channel_id
slack_channel_id = var.monitoring.slack_channel_id
new_relic_account_id = var.new_relic_account_id
}
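The var.monitoring object passed to the module above suggests a declaration roughly like the following in the calling module; the exact attribute types are assumptions based on how the values are used here:

variable "monitoring" {
  description = "Monitoring and alerting settings for the snapshot service"
  type = object({
    enable               = bool
    alert_email          = string
    slack_enable         = bool
    slack_destination_id = string
    slack_channel_id     = string
  })
}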
20 changes: 11 additions & 9 deletions tf-managed/modules/daily-snapshot/monitoring/event_rules.tf
@@ -1,26 +1,28 @@
# This file contains NR event rules used to generate metrics from logs, given that
# the service does not generate metrics by itself.
resource "newrelic_events_to_metrics_rule" "generate_snapshot_attempt_metrics" {
account_id = var.new_relic_account_id
for_each = toset(["mainnet", "calibnet"])
for_each = toset(["mainnet", "calibnet"])

name = format("%s %s snapshot generation attempts", var.service_name, each.key)
name = format("%s %s snapshot generation attempts", var.service_name, each.key)
description = "Snapshot generation attempts"
nrql = "From Log select uniqueCount(message) as '${var.service_name}.${each.key}.snapshot_generation_run' WHERE `hostname` = '${var.service_name}' AND filePath ='/root/logs/${each.key}_log.txt' AND message LIKE '%running snapshot export%'"
nrql = "From Log select uniqueCount(message) as '${var.service_name}.${each.key}.snapshot_generation_run' WHERE `hostname` = '${var.service_name}' AND filePath ='/root/logs/${each.key}_log.txt' AND message LIKE '%running snapshot export%'"
}

resource "newrelic_events_to_metrics_rule" "generate_snapshot_success_metrics" {
account_id = var.new_relic_account_id
for_each = toset(["mainnet", "calibnet"])
for_each = toset(["mainnet", "calibnet"])

name = format("%s %s snapshot generation success", var.service_name, each.key)
name = format("%s %s snapshot generation success", var.service_name, each.key)
description = "Success snapshot generations"
nrql = "From Log select uniqueCount(message) as '${var.service_name}.${each.key}.snapshot_generation_ok' WHERE `hostname` = '${var.service_name}' AND filePath ='/root/logs/${each.key}_log.txt' AND message LIKE '%Snapshot uploaded for%'"
nrql = "From Log select uniqueCount(message) as '${var.service_name}.${each.key}.snapshot_generation_ok' WHERE `hostname` = '${var.service_name}' AND filePath ='/root/logs/${each.key}_log.txt' AND message LIKE '%Snapshot uploaded for%'"
}

resource "newrelic_events_to_metrics_rule" "generate_snapshot_fail_metrics" {
account_id = var.new_relic_account_id
for_each = toset(["mainnet", "calibnet"])
for_each = toset(["mainnet", "calibnet"])

name = format("%s %s snapshot generation failure", var.service_name, each.key)
name = format("%s %s snapshot generation failure", var.service_name, each.key)
description = "Failed snapshot generations"
nrql = "From Log select uniqueCount(message) as '${var.service_name}.${each.key}.snapshot_generation_fail' WHERE `hostname` = '${var.service_name}' AND filePath ='/root/logs/${each.key}_log.txt' AND message LIKE '%Snapshot upload failed for%'"
nrql = "From Log select uniqueCount(message) as '${var.service_name}.${each.key}.snapshot_generation_fail' WHERE `hostname` = '${var.service_name}' AND filePath ='/root/logs/${each.key}_log.txt' AND message LIKE '%Snapshot upload failed for%'"
}
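Once these events-to-metrics rules are active, the generated metrics can be sanity-checked with a query of the same shape as the NRQL above; an illustrative check (the metric-name pattern is an assumption):

FROM Metric SELECT uniques(metricName) WHERE metricName LIKE '%.snapshot_generation_%' SINCE 1 day ago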
161 changes: 113 additions & 48 deletions tf-managed/modules/daily-snapshot/monitoring/main.tf
@@ -6,80 +6,80 @@ locals {
enable_email = var.alert_email != ""
}

resource "newrelic_nrql_alert_condition" "disk_space" {
policy_id = newrelic_alert_policy.alert.id
type = "static"
name = "High Disk Utilization"
description = "Alert when disk space usage is high on an the service host"
enabled = true

nrql {
query = "SELECT latest(diskUsedPercent) FROM StorageSample where entityName = '${var.service_name}'"
}

critical {
operator = "above"
# threshold = 85.0
threshold = 20.0
threshold_duration = 300
threshold_occurrences = "ALL"
}

warning {
operator = "above"
# threshold = 70.0
threshold = 10.0
threshold_duration = 300
threshold_occurrences = "ALL"
}
}
# resource "newrelic_nrql_alert_condition" "disk_space" {
# policy_id = newrelic_alert_policy.alert.id
# type = "static"
# name = "High Disk Utilization"
# description = "Alert when disk space usage is high on an the service host"
# enabled = true
#
# nrql {
# query = "SELECT latest(diskUsedPercent) FROM StorageSample where entityName = '${var.service_name}'"
# }
#
# critical {
# operator = "above"
# # threshold = 85.0
# threshold = 20.0
# threshold_duration = 300
# threshold_occurrences = "ALL"
# }
#
# warning {
# operator = "above"
# # threshold = 70.0
# threshold = 10.0
# threshold_duration = 300
# threshold_occurrences = "ALL"
# }
# }
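Note that commenting the resource out removes the alert condition on the next apply. An alternative, not used in this commit, would be to keep the resource and gate it with a count flag, mirroring the count pattern used for the e-mail and Slack resources below; var.disk_alert_enable is a hypothetical variable:

resource "newrelic_nrql_alert_condition" "disk_space" {
  count       = var.disk_alert_enable ? 1 : 0 # hypothetical toggle variable
  policy_id   = newrelic_alert_policy.alert.id
  type        = "static"
  name        = "High Disk Utilization"
  description = "Alert when disk space usage is high on the service host"
  enabled     = true

  nrql {
    query = "SELECT latest(diskUsedPercent) FROM StorageSample where entityName = '${var.service_name}'"
  }

  critical {
    operator              = "above"
    threshold             = 85.0
    threshold_duration    = 300
    threshold_occurrences = "ALL"
  }
}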

resource "newrelic_notification_destination" "email" {
count = local.enable_email ? 1 : 0
name = format("%s email", var.service_name)
type = "EMAIL"
type = "EMAIL"

property {
key = "email"
key = "email"
value = var.alert_email
}
}

resource "newrelic_notification_channel" "email-channel" {
count = local.enable_email ? 1 : 0
name = format("%s email", var.service_name)
type = "EMAIL"
product = "IINT"
count = local.enable_email ? 1 : 0
name = format("%s email", var.service_name)
type = "EMAIL"
product = "IINT"
destination_id = newrelic_notification_destination.email[0].id

property {
key = "subject"
key = "subject"
value = format("%s alert", var.service_name)
}
}

resource "newrelic_notification_channel" "slack-channel" {
count = var.slack_enable ? 1 : 0
name = format("%s slack", var.service_name)
type = "SLACK"
count = var.slack_enable ? 1 : 0
name = format("%s slack", var.service_name)
type = "SLACK"
destination_id = var.slack_destination_id
product = "IINT"
product = "IINT"

property {
key = "channelId"
key = "channelId"
value = var.slack_channel_id
}

property {
key = "customDetailsSlack"
key = "customDetailsSlack"
value = "issue id - {{issueId}}"
}
}


resource "newrelic_workflow" "alerting-workflow-mails" {
count = local.enable_email ? 1 : 0
name = format("%s mail alerting workflow", var.service_name)
count = local.enable_email ? 1 : 0
name = format("%s mail alerting workflow", var.service_name)
muting_rules_handling = "NOTIFY_ALL_ISSUES"

issues_filter {
@@ -88,8 +88,8 @@ resource "newrelic_workflow" "alerting-workflow-mails" {

predicate {
attribute = "labels.policyIds"
operator = "EXACTLY_MATCHES"
values = [ newrelic_alert_policy.alert.id ]
operator = "EXACTLY_MATCHES"
values = [newrelic_alert_policy.alert.id]
}
}

@@ -101,8 +101,8 @@ resource "newrelic_workflow" "alerting-workflow-mails" {
# Limitation of NR provider - only one workflow can be created per channel. Might be resolved in the future.
# https://registry.terraform.io/providers/newrelic/newrelic/latest/docs/resources/workflow#nested-destination-blocks
resource "newrelic_workflow" "alerting-workflow-slack" {
count = var.slack_enable ? 1 : 0
name = format("%s slack alerting workflow", var.service_name)
count = var.slack_enable ? 1 : 0
name = format("%s slack alerting workflow", var.service_name)
muting_rules_handling = "NOTIFY_ALL_ISSUES"

issues_filter {
@@ -111,12 +111,77 @@ resource "newrelic_workflow" "alerting-workflow-slack" {

predicate {
attribute = "labels.policyIds"
operator = "EXACTLY_MATCHES"
values = [ newrelic_alert_policy.alert.id ]
operator = "EXACTLY_MATCHES"
values = [newrelic_alert_policy.alert.id]
}
}

destination {
channel_id = newrelic_notification_channel.slack-channel[0].id
}
}

# At least 1 snapshot is generated within a 5-hour interval
resource "newrelic_nrql_alert_condition" "snapshot_frequency_condition" {
for_each = toset(["mainnet", "calibnet"])
policy_id = newrelic_alert_policy.alert.id
type = "static"
name = format("Low snapshot generation frequency - %s", each.key)
description = "Alert when snapshots are not generated within requried time interval"
enabled = true

# evaluation_delay = 7200 # 2 hours, it may take some time to generate a snapshot
# aggregation_window = 14400 # 4 hours, it may take some time to generate a snapshot
aggregation_window = 360 # 6 minutes; the original 4-hour window (14400) is kept commented out above


nrql {
query = format("FROM Metric SELECT count(`${var.service_name}.${each.key}.snapshot_generation_ok`)")
}

warning {
operator = "below"
threshold = 1
# threshold_duration = 14400
threshold_duration = 360
threshold_occurrences = "ALL"
}

critical {
operator = "below"
threshold = 1
# threshold_duration = 28800
threshold_duration = 720
threshold_occurrences = "ALL"
}
}

# At least 1 successful snapshot out of 3 attempts

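The comment above describes a condition that this commit does not implement. A purely illustrative sketch of how it could be expressed against the attempt and success metrics from event_rules.tf, kept commented out like the surrounding block (resource name, thresholds, and durations are assumptions):

# resource "newrelic_nrql_alert_condition" "snapshot_success_ratio" {
#   for_each  = toset(["mainnet", "calibnet"])
#   policy_id = newrelic_alert_policy.alert.id
#   type      = "static"
#   name      = format("Low snapshot success ratio - %s", each.key)
#   enabled   = true
#
#   nrql {
#     query = "FROM Metric SELECT count(`${var.service_name}.${each.key}.snapshot_generation_ok`) / count(`${var.service_name}.${each.key}.snapshot_generation_run`)"
#   }
#
#   critical {
#     operator              = "below"
#     threshold             = 0.34 # roughly 1 success out of 3 attempts
#     threshold_duration    = 14400
#     threshold_occurrences = "ALL"
#   }
# }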
#resource "newrelic_nrql_alert_condition" "disk_space" {
# policy_id = newrelic_alert_policy.alert.id
# type = "static"
# name = "High Disk Utilization"
# description = "Alert when disk space usage is high on an the service host"
# enabled = true
#
# nrql {
# query = "SELECT latest(diskUsedPercent) FROM StorageSample where entityName = '${var.service_name}'"
# }
#
# critical {
# operator = "above"
# # threshold = 85.0
# threshold = 20.0
# threshold_duration = 300
# threshold_occurrences = "ALL"
# }
#
# warning {
# operator = "above"
# # threshold = 70.0
# threshold = 10.0
# threshold_duration = 300
# threshold_occurrences = "ALL"
# }
#}
2 changes: 1 addition & 1 deletion tf-managed/modules/daily-snapshot/prep_sources.sh
@@ -6,7 +6,7 @@ set -euxo pipefail
# Copy local source files into a folder together with ruby_common and create a tar archive.

cd "$1"
cp --archive $2/ruby_common service/
cp --archive "$2"/ruby_common service/

rm -f sources.tar
(cd service && tar cf ../sources.tar --sort=name --mtime='UTC 2019-01-01' ./* > /dev/null 2>&1)
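For reference, a hypothetical invocation of the script; the first argument is the module directory containing service/, the second is the directory holding ruby_common/ (resolved after the cd into the first argument, so an absolute path is safest). Both paths below are illustrative only:

./prep_sources.sh /opt/forest-iac/tf-managed/modules/daily-snapshot /opt/forest-iac/tf-managed/common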
13 changes: 7 additions & 6 deletions tf-managed/modules/daily-snapshot/service/newrelic_fail2ban.sh
@@ -17,11 +17,11 @@ if [ -n "$NEW_RELIC_API_KEY" ] ; then
# https://docs.newrelic.com/docs/infrastructure/install-infrastructure-agent/configuration/infrastructure-agent-configuration-settings/#offline-time-to-reset
cat >> /etc/newrelic-infra.yml <<EOF
metrics_network_sample_rate: -1
metrics_process_sample_rate: -1
metrics_system_sample_rate: 600
metrics_storage_sample_rate: 600
metrics_nfs_sample_rate: 600
container_cache_metadata_limit: 600
metrics_process_sample_rate: 60
metrics_system_sample_rate: 60
metrics_storage_sample_rate: 60
metrics_nfs_sample_rate: 60
container_cache_metadata_limit: 60
disable_zero_mem_process_filter: true
disable_all_plugins: true
disable_cloud_metadata: true
@@ -43,9 +43,10 @@ logs:
attributes:
newrelic-cli: true
logtype: newrelic-cli
- name: snapshot_logs
file: /root/logs/*_log.txt
EOF


sudo systemctl restart newrelic-infra
fi
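With the new snapshot_logs source in place, arrival of the forwarded logs can be verified with a query of the same shape as those in event_rules.tf; an illustrative NRQL check:

FROM Log SELECT count(*) WHERE filePath LIKE '/root/logs/%_log.txt' FACET filePath SINCE 1 hour ago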
