Skip to content

Commit

Permalink
cleanup log-based alerts
Browse files Browse the repository at this point in the history
  • Loading branch information
LesnyRumcajs committed Jan 18, 2024
1 parent 171f094 commit 84ed14a
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 120 deletions.
28 changes: 0 additions & 28 deletions tf-managed/modules/daily-snapshot/monitoring/event_rules.tf

This file was deleted.

117 changes: 25 additions & 92 deletions tf-managed/modules/daily-snapshot/monitoring/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -6,33 +6,31 @@ locals {
enable_email = var.alert_email != ""
}

# resource "newrelic_nrql_alert_condition" "disk_space" {
# policy_id = newrelic_alert_policy.alert.id
# type = "static"
# name = "High Disk Utilization"
# description = "Alert when disk space usage is high on an the service host"
# enabled = true
#
# nrql {
# query = "SELECT latest(diskUsedPercent) FROM StorageSample where entityName = '${var.service_name}'"
# }
#
# critical {
# operator = "above"
# # threshold = 85.0
# threshold = 20.0
# threshold_duration = 300
# threshold_occurrences = "ALL"
# }
#
# warning {
# operator = "above"
# # threshold = 70.0
# threshold = 10.0
# threshold_duration = 300
# threshold_occurrences = "ALL"
# }
# }
# Alerts on high disk utilization of the service host. The condition evaluates
# the latest `diskUsedPercent` value from `StorageSample` for the entity whose
# name matches `var.service_name`, and is attached to `newrelic_alert_policy.alert`.
resource "newrelic_nrql_alert_condition" "disk_space" {
  policy_id   = newrelic_alert_policy.alert.id
  type        = "static"
  name        = "High Disk Utilization"
  description = "Alert when disk space usage is high on the service host"
  enabled     = true

  nrql {
    query = "SELECT latest(diskUsedPercent) FROM StorageSample where entityName = '${var.service_name}'"
  }

  # Critical: usage stays above 85% for the whole 300-second duration.
  critical {
    operator              = "above"
    threshold             = 85.0
    threshold_duration    = 300
    threshold_occurrences = "ALL"
  }

  # Warning: usage stays above 70% for the whole 300-second duration.
  warning {
    operator              = "above"
    threshold             = 70.0
    threshold_duration    = 300
    threshold_occurrences = "ALL"
  }
}

resource "newrelic_notification_destination" "email" {
count = local.enable_email ? 1 : 0
Expand Down Expand Up @@ -120,68 +118,3 @@ resource "newrelic_workflow" "alerting-workflow-slack" {
channel_id = newrelic_notification_channel.slack-channel[0].id
}
}

# Snapshot generation frequency: one condition per network that fires when the
# count of `<service_name>.<network>.snapshot_generation_ok` metric points drops
# below 1 within the evaluation window.
#
# NOTE(review): the active windows below are 360 s / 720 s (6 / 12 minutes),
# not hours. The commented-out values (7200, 14400, 28800) look like the
# intended production settings - confirm before relying on this alert.
resource "newrelic_nrql_alert_condition" "snapshot_frequency_condition" {
  for_each    = toset(["mainnet", "calibnet"])
  policy_id   = newrelic_alert_policy.alert.id
  type        = "static"
  name        = format("Low snapshot generation frequency - %s", each.key)
  description = "Alert when snapshots are not generated within required time interval"
  enabled     = true

  # evaluation_delay = 7200 # 2 hours, it may take some time to generate a snapshot
  # aggregation_window = 14400 # 4 hours, it may take some time to generate a snapshot
  aggregation_window = 360 # 6 minutes (value is in seconds)

  nrql {
    # Plain string interpolation: wrapping this in format() added nothing (no
    # format verbs) and would misinterpret any '%' in the interpolated values.
    query = "FROM Metric SELECT count(`${var.service_name}.${each.key}.snapshot_generation_ok`)"
  }

  # threshold_duration is kept a multiple of aggregation_window (360).
  warning {
    operator  = "below"
    threshold = 1
    # threshold_duration = 14400
    threshold_duration    = 360
    threshold_occurrences = "ALL"
  }

  critical {
    operator  = "below"
    threshold = 1
    # threshold_duration = 28800
    threshold_duration    = 720
    threshold_occurrences = "ALL"
  }
}

# At least 1 successful snapshot out of 3 attempts

#resource "newrelic_nrql_alert_condition" "disk_space" {
# policy_id = newrelic_alert_policy.alert.id
# type = "static"
# name = "High Disk Utilization"
# description = "Alert when disk space usage is high on an the service host"
# enabled = true
#
# nrql {
# query = "SELECT latest(diskUsedPercent) FROM StorageSample where entityName = '${var.service_name}'"
# }
#
# critical {
# operator = "above"
# # threshold = 85.0
# threshold = 20.0
# threshold_duration = 300
# threshold_occurrences = "ALL"
# }
#
# warning {
# operator = "above"
# # threshold = 70.0
# threshold = 10.0
# threshold_duration = 300
# threshold_occurrences = "ALL"
# }
#}

0 comments on commit 84ed14a

Please sign in to comment.