From 58e290d461557309d051c2268707dd60701572aa Mon Sep 17 00:00:00 2001
From: Monty Dawson <wgd23@cam.ac.uk>
Date: Tue, 15 Jun 2021 14:01:08 +0100
Subject: [PATCH] Surface the alerting success threshold percentage variable.

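Expose the uptime check success threshold so that consumers of this
module can tune how many failed checks within a period are tolerated
before an alert fires. A minimal sketch of how a consumer might set
it, assuming a hypothetical consuming configuration (the module name
and source below are illustrative, not part of this change):

    module "webapp" {
      # Illustrative source; substitute the real path or registry
      # address of this module.
      source = "./modules/uptime-monitored-app"

      # Alert only when fewer than 90% of uptime checks in the
      # configured period succeed. Defaults to 75 when unset.
      alerting_success_threshold_percent = 90
    }
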
---
 main.tf      |  7 ++++---
 variables.tf | 13 +++++++++++++
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/main.tf b/main.tf
index bcba828..4c82a33 100644
--- a/main.tf
+++ b/main.tf
@@ -248,9 +248,10 @@ module "uptime_monitoring" {
   uptime_check = {
     # Accept either e.g. "60s" or 60 for timeout and periods for compatibility
     # with previous releases.
-    timeout = tonumber(trimsuffix(var.alerting_uptime_timeout, "s"))
-    period  = tonumber(trimsuffix(var.alerting_uptime_period, "s"))
-    path    = var.monitoring_path
+    timeout                   = tonumber(trimsuffix(var.alerting_uptime_timeout, "s"))
+    period                    = tonumber(trimsuffix(var.alerting_uptime_period, "s"))
+    path                      = var.monitoring_path
+    success_threshold_percent = var.alerting_success_threshold_percent
 
     alert_enabled = var.alerting_enabled
   }
diff --git a/variables.tf b/variables.tf
index 0541a9d..22a4191 100644
--- a/variables.tf
+++ b/variables.tf
@@ -122,6 +122,19 @@ variable "alerting_uptime_period" {
   description = "Frequency of uptime checks"
 }
 
+variable "alerting_success_threshold_percent" {
+  type        = number
+  default     = 75
+  description = <<EOT
+If the percentage of successful uptime checks within the given uptime period falls below
+this threshold, an alert is triggered. Set this to 100 to alert whenever any uptime check
+fails, or to a lower number to tolerate occasional failures without alerting.
+
+Experience has taught us that uptime checks can fail semi-regularly due to transient
+problems outside our control, so we allow some leeway before triggering an alert.
+EOT
+}
+
 variable "alerting_enabled" {
   type        = bool
   default     = true
-- 
GitLab