From fed678a266f5fe8461a03ec53e1062c0e299387f Mon Sep 17 00:00:00 2001
From: Monty Dawson <wgd23@cam.ac.uk>
Date: Wed, 14 Apr 2021 16:55:13 +0100
Subject: [PATCH 1/2] Allow for uptime check not to run for up to 120s, and
 explicitly monitor for uptime check failure

---
 main.tf | 38 ++++++++++++++++++++++++++++++++++----
 1 file changed, 34 insertions(+), 4 deletions(-)

diff --git a/main.tf b/main.tf
index 8978d9c..2bbf3bc 100644
--- a/main.tf
+++ b/main.tf
@@ -44,14 +44,14 @@ resource "google_monitoring_notification_channel" "email" {
 
 resource "google_monitoring_alert_policy" "uptime_alert" {
   enabled      = local.uptime_check.alert_enabled
-  display_name = "Uptime check for ${var.host}"
+  display_name = "Uptime check for ${var.host} (${terraform.workspace})"
   notification_channels = [
     for channel in google_monitoring_notification_channel.email : channel.id
   ]
 
   combiner = "OR"
   conditions {
-    display_name = "Uptime check for ${var.host}"
+    display_name = "Uptime check has run for ${var.host} (${terraform.workspace})"
 
     condition_threshold {
       filter = trimspace(replace(
@@ -62,13 +62,13 @@ resource "google_monitoring_alert_policy" "uptime_alert" {
         EOT
         , "\n", " "
       ))
-      duration   = "60s"
+      duration   = "120s"
       comparison = "COMPARISON_LT"
 
       threshold_value = "1"
 
       aggregations {
-        alignment_period     = "60s"
+        alignment_period     = "120s"
         cross_series_reducer = "REDUCE_COUNT_TRUE"
         group_by_fields = [
           "resource.*",
@@ -79,6 +79,36 @@ resource "google_monitoring_alert_policy" "uptime_alert" {
       trigger { count = "1" }
     }
   }
+
+  conditions {
+    display_name = "Uptime check failed for ${var.host} (${terraform.workspace})"
+
+    condition_threshold {
+      filter = trimspace(replace(
+        <<-EOT
+        metric.type="monitoring.googleapis.com/uptime_check/check_passed" AND
+        metric.label.check_id="${google_monitoring_uptime_check_config.https.uptime_check_id}" AND
+        resource.type="uptime_url"
+        EOT
+        , "\n", " "
+      ))
+      duration   = "60s"
+      comparison = "COMPARISON_GT"
+
+      threshold_value = "0"
+
+      aggregations {
+        alignment_period     = "60s"
+        cross_series_reducer = "REDUCE_COUNT_FALSE"
+        group_by_fields = [
+          "resource.*",
+        ]
+        per_series_aligner = "ALIGN_NEXT_OLDER"
+      }
+
+      trigger { count = "1" }
+    }
+  }
 }
 
 resource "google_monitoring_alert_policy" "ssl_cert_expiry" {
-- 
GitLab


From caf62c469ef7683e1f15af69ca7f4e1acdaa69a2 Mon Sep 17 00:00:00 2001
From: Monty Dawson <wgd23@cam.ac.uk>
Date: Sun, 18 Apr 2021 14:59:01 +0100
Subject: [PATCH 2/2] Use condition absent to detect no data from uptime checks

This guards against opening two incidents when
an uptime check fails, which was the downside of
the previous approach of expecting at least one
successful uptime check within a given timeframe
(a failing check could trip both that condition
and the explicit "uptime check failed" condition).
---
 main.tf | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/main.tf b/main.tf
index 2bbf3bc..0bfdac3 100644
--- a/main.tf
+++ b/main.tf
@@ -50,10 +50,11 @@ resource "google_monitoring_alert_policy" "uptime_alert" {
   ]
 
   combiner = "OR"
+
   conditions {
     display_name = "Uptime check has run for ${var.host} (${terraform.workspace})"
 
-    condition_threshold {
+    condition_absent {
       filter = trimspace(replace(
         <<-EOT
         metric.type="monitoring.googleapis.com/uptime_check/check_passed" AND
@@ -62,18 +63,15 @@ resource "google_monitoring_alert_policy" "uptime_alert" {
         EOT
         , "\n", " "
       ))
-      duration   = "120s"
-      comparison = "COMPARISON_LT"
-
-      threshold_value = "1"
+      # Absent conditions require a minimum duration of 2 minutes, so clamp the check period to at least 120s.
+      duration = "${max(local.uptime_check.period, 120)}s"
 
       aggregations {
-        alignment_period     = "120s"
-        cross_series_reducer = "REDUCE_COUNT_TRUE"
-        group_by_fields = [
-          "resource.*",
-        ]
-        per_series_aligner = "ALIGN_NEXT_OLDER"
+        # Align over the same window as the condition duration (check period, clamped to at least 120s).
+        alignment_period     = "${max(local.uptime_check.period, 120)}s"
+        cross_series_reducer = "REDUCE_COUNT"
+        group_by_fields      = []
+        per_series_aligner   = "ALIGN_COUNT"
       }
 
       trigger { count = "1" }
-- 
GitLab