From fed678a266f5fe8461a03ec53e1062c0e299387f Mon Sep 17 00:00:00 2001
From: Monty Dawson <wgd23@cam.ac.uk>
Date: Wed, 14 Apr 2021 16:55:13 +0100
Subject: [PATCH 1/2] Allow for uptime check not to run for up to 120s, and
 explicitly monitor for uptime check failure

---
 main.tf | 38 ++++++++++++++++++++++++++++++++++----
 1 file changed, 34 insertions(+), 4 deletions(-)

diff --git a/main.tf b/main.tf
index 8978d9c..2bbf3bc 100644
--- a/main.tf
+++ b/main.tf
@@ -44,14 +44,14 @@ resource "google_monitoring_notification_channel" "email" {
 resource "google_monitoring_alert_policy" "uptime_alert" {
   enabled = local.uptime_check.alert_enabled
-  display_name = "Uptime check for ${var.host}"
+  display_name = "Uptime check for ${var.host} (${terraform.workspace})"
 
   notification_channels = [
     for channel in google_monitoring_notification_channel.email : channel.id
   ]
 
   combiner = "OR"
   conditions {
-    display_name = "Uptime check for ${var.host}"
+    display_name = "Uptime check has run for ${var.host} (${terraform.workspace})"
 
     condition_threshold {
       filter = trimspace(replace(
@@ -62,13 +62,13 @@ resource "google_monitoring_alert_policy" "uptime_alert" {
        EOT
        , "\n", " "
      ))
-      duration = "60s"
+      duration = "120s"
      comparison = "COMPARISON_LT"
 
      threshold_value = "1"
 
      aggregations {
-        alignment_period = "60s"
+        alignment_period = "120s"
        cross_series_reducer = "REDUCE_COUNT_TRUE"
        group_by_fields = [
          "resource.*",
@@ -79,6 +79,36 @@ resource "google_monitoring_alert_policy" "uptime_alert" {
      trigger { count = "1" }
    }
  }
+
+  conditions {
+    display_name = "Uptime check failed for ${var.host} (${terraform.workspace})"
+
+    condition_threshold {
+      filter = trimspace(replace(
+        <<-EOT
+          metric.type="monitoring.googleapis.com/uptime_check/check_passed" AND
+          metric.label.check_id="${google_monitoring_uptime_check_config.https.uptime_check_id}" AND
+          resource.type="uptime_url"
+        EOT
+        , "\n", " "
+      ))
+      duration = "60s"
+      comparison = "COMPARISON_GT"
+
+      threshold_value = "0"
+
+      aggregations {
+        alignment_period = "60s"
+        cross_series_reducer = "REDUCE_COUNT_FALSE"
+        group_by_fields = [
+          "resource.*",
+        ]
+        per_series_aligner = "ALIGN_NEXT_OLDER"
+      }
+
+      trigger { count = "1" }
+    }
+  }
 }
 
 resource "google_monitoring_alert_policy" "ssl_cert_expiry" {
--
GitLab
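Both conditions in this patch build their filter with the trimspace(replace(<<-EOT ... EOT, "\n", " ")) idiom, which collapses the multi-line heredoc into the single-line filter string the Cloud Monitoring API expects. A minimal standalone sketch of the idiom follows; the output name is illustrative and not part of the module:

output "example_uptime_filter" {
  # Illustrative only: the heredoc is flattened into a one-line filter expression.
  value = trimspace(replace(
    <<-EOT
      metric.type="monitoring.googleapis.com/uptime_check/check_passed" AND
      resource.type="uptime_url"
    EOT
    , "\n", " "
  ))
  # => metric.type="monitoring.googleapis.com/uptime_check/check_passed" AND resource.type="uptime_url"
}

With combiner = "OR", either condition opens an incident: the existing one fires when fewer than one successful check lands in a 120s window, and the new one fires as soon as any check result in a 60s window reports check_passed = false.
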
From caf62c469ef7683e1f15af69ca7f4e1acdaa69a2 Mon Sep 17 00:00:00 2001
From: Monty Dawson <wgd23@cam.ac.uk>
Date: Sun, 18 Apr 2021 14:59:01 +0100
Subject: [PATCH 2/2] Use condition absent to detect no data from uptime checks

This guards against opening two incidents when an uptime check fails,
which was the downside of the previous approach of expecting x amount of
successful uptime checks within a given timeframe.
---
 main.tf | 20 +++++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/main.tf b/main.tf
index 2bbf3bc..0bfdac3 100644
--- a/main.tf
+++ b/main.tf
@@ -50,10 +50,11 @@ resource "google_monitoring_alert_policy" "uptime_alert" {
   ]
 
   combiner = "OR"
+
   conditions {
     display_name = "Uptime check has run for ${var.host} (${terraform.workspace})"
 
-    condition_threshold {
+    condition_absent {
       filter = trimspace(replace(
         <<-EOT
           metric.type="monitoring.googleapis.com/uptime_check/check_passed" AND
@@ -62,18 +63,15 @@ resource "google_monitoring_alert_policy" "uptime_alert" {
        EOT
        , "\n", " "
      ))
-      duration = "120s"
-      comparison = "COMPARISON_LT"
-
-      threshold_value = "1"
+      # absent conditions have to have a min duration of 2 minutes
+      duration = "${max(local.uptime_check.period, 120)}s"
 
      aggregations {
-        alignment_period = "120s"
-        cross_series_reducer = "REDUCE_COUNT_TRUE"
-        group_by_fields = [
-          "resource.*",
-        ]
-        per_series_aligner = "ALIGN_NEXT_OLDER"
+        # absent conditions have to have a min duration of 2 minutes
+        alignment_period = "${max(local.uptime_check.period, 120)}s"
+        cross_series_reducer = "REDUCE_COUNT"
+        group_by_fields = []
+        per_series_aligner = "ALIGN_COUNT"
      }
 
      trigger { count = "1" }
--
GitLab
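The "${max(local.uptime_check.period, 120)}s" expression clamps the absence window to the two-minute minimum noted in the patch comments while still covering one full check period for slower checks. The local.uptime_check object is assumed to be defined elsewhere in the module; a minimal sketch of what it might look like follows, with illustrative values rather than the module's actual defaults:

locals {
  uptime_check = {
    alert_enabled = true # whether the alert policy is enabled
    period        = 300  # seconds between uptime check executions (assumed default)
  }
}

# With period = 300 the absence duration renders as "300s"; with period = 60 it is
# clamped to "120s", the minimum the patch comments say absence conditions allow.

Because the absence condition now means "no data at all" rather than "fewer than one success", a check that runs but fails only trips the separate failure condition, so a single outage no longer opens two incidents.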