From 58e290d461557309d051c2268707dd60701572aa Mon Sep 17 00:00:00 2001
From: Monty Dawson <wgd23@cam.ac.uk>
Date: Tue, 15 Jun 2021 14:01:08 +0100
Subject: [PATCH] Surface the alerting percentage variable.

---
 main.tf      |  7 ++++---
 variables.tf | 13 +++++++++++++
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/main.tf b/main.tf
index bcba828..4c82a33 100644
--- a/main.tf
+++ b/main.tf
@@ -248,9 +248,10 @@ module "uptime_monitoring" {
   uptime_check = {
     # Accept either e.g. "60s" or 60 for timeout and periods for compatibility
     # with previous releases.
-    timeout = tonumber(trimsuffix(var.alerting_uptime_timeout, "s"))
-    period  = tonumber(trimsuffix(var.alerting_uptime_period, "s"))
-    path    = var.monitoring_path
+    timeout                   = tonumber(trimsuffix(var.alerting_uptime_timeout, "s"))
+    period                    = tonumber(trimsuffix(var.alerting_uptime_period, "s"))
+    path                      = var.monitoring_path
+    success_threshold_percent = var.alerting_success_threshold_percent
 
     alert_enabled = var.alerting_enabled
   }
diff --git a/variables.tf b/variables.tf
index 0541a9d..22a4191 100644
--- a/variables.tf
+++ b/variables.tf
@@ -122,6 +122,19 @@ variable "alerting_uptime_period" {
   description = "Frequency of uptime checks"
 }
 
+variable "alerting_success_threshold_percent" {
+  type        = number
+  default     = 75
+  description = <<EOT
+If the percentage of successful uptime checks within the given uptime period falls below
+this value, an alert will be triggered. Set to 100 to trigger an alert if any uptime check
+fails; set a lower number to tolerate some failures without alerting.
+
+Experience has taught us that uptime checks can fail semi-regularly due to transient
+problems outside our control; we therefore allow some leeway before triggering an alert.
+EOT
+}
+
 variable "alerting_enabled" {
   type = bool
   default = true
-- 
GitLab
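
For reference, a minimal sketch of how a consuming configuration might set the newly
surfaced variable. The variable names match those in this patch, but the module label
and source path below are hypothetical placeholders, not part of the change itself:

module "monitored_site" {
  source = "..."  # hypothetical source for the module this patch modifies

  monitoring_path         = "/healthz"
  alerting_enabled        = true
  alerting_uptime_timeout = "30s"
  alerting_uptime_period  = "60s"

  # Alert as soon as any single uptime check fails, instead of tolerating
  # the 25% failure rate implied by the default of 75.
  alerting_success_threshold_percent = 100
}

Leaving alerting_success_threshold_percent unset keeps the default of 75, i.e. up to a
quarter of checks within the period may fail before an alert fires.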