feat(alerting): Add failure alerts to catch explicit failures during invocation.

b7fec48b · Monty Dawson · 825f1472 · b7fec48b · b7fec48b · b7fec48b
Commit b7fec48b authored 9 months ago by Monty Dawson
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## [2.4.0] - 2024-05-29
+
+### Added
+
+- Added support for explicit failure alerts, enabled via the `failure_alert_enabled` variable and configurable
+  via the `alert_failure_threshold` and `alert_failure_period` variables.
+
 ## [2.3.1] - 2024-02-05

 ### Added

--- a/main.tf
+++ b/main.tf
@@ -259,3 +259,72 @@ resource "google_monitoring_alert_policy" "successes" {

  provider = google.monitoring
 }
+
+
+# Log-based metric counting Cloud Scheduler log entries for this script's job
+# whose HTTP response status is 400 or above, i.e. unsuccessful invocations.
+resource "google_logging_metric" "failures" {
+  project = local.project
+
+  name = "script/failures-${local.kebab_name}"
+
+  filter = trimspace(replace(
+    <<-EOI
+      resource.type="cloud_scheduler_job"
+      AND resource.labels.project_id = "${local.project}"
+      AND resource.labels.job_id = "${google_cloud_scheduler_job.script.name}"
+      AND httpRequest.status>=400
+    EOI
+  , "\n", " "))
+
+  metric_descriptor {
+    metric_kind = "DELTA"
+    value_type  = "INT64"
+  }
+}
+
+# Alert policy raised when the failures log metric exceeds the configured threshold.
+resource "google_monitoring_alert_policy" "failures" {
+  project = local.monitoring_project
+
+  display_name = "Invocations of ${var.name} script failed (${terraform.workspace})"
+
+  enabled = var.failure_alert_enabled
+
+  notification_channels = local.notification_channels
+
+  combiner = "OR"
+
+  conditions {
+    # The workspace name is part of the condition name to make alerts easier to triage.
+    display_name = "Invocations of ${var.name} script failed (${terraform.workspace})"
+
+    condition_threshold {
+      # Select the failures log-based metric, restricted to Cloud Scheduler job
+      # resources in the script's project.
+      filter = trimspace(replace(
+        <<-EOI
+          metric.type="logging.googleapis.com/user/${google_logging_metric.failures.id}"
+          AND resource.type="cloud_scheduler_job"
+          AND resource.label.project_id="${local.project}"
+        EOI
+        , "\n", " "
+      ))
+
+      # The threshold must stay exceeded for a full minute before the alert fires.
+      duration   = "60s"
+      comparison = "COMPARISON_GT"
+
+      # Sum the failure count over each alignment period and across all time series.
+      aggregations {
+        alignment_period     = var.alert_failure_period
+        per_series_aligner   = "ALIGN_SUM"
+        cross_series_reducer = "REDUCE_SUM"
+      }
+
+      threshold_value = var.alert_failure_threshold
+    }
+  }
+
+  provider = google.monitoring
+}
\ No newline at end of file
--- a/variables.tf
+++ b/variables.tf
@@ -150,6 +150,32 @@ variable "alert_enabled" {
  EOT
 }

+variable "failure_alert_enabled" {
+  type        = bool
+  default     = false # Opt-in: this value is passed to 'enabled' on the failures alert policy.
+  description = <<-EOT
+    Flag indicating if failure alerts should be enabled for this script, separate
+    to the default alert which is raised if a successful invocation has not been
+    detected within the alerting period.
+  EOT
+}
+
+variable "alert_failure_threshold" {
+  type        = number
+  default     = 1 # Compared using COMPARISON_GT, so more than one failure is needed by default.
+  description = <<-EOT
+    The number of failures within 'alert_failure_period' which must be exceeded to trigger a failure alert.
+  EOT
+}
+
+variable "alert_failure_period" {
+  type        = string
+  default     = "3600s" # One hour; used as the failure alert's aggregation alignment_period.
+  description = <<-EOT
+    Period over which 'alert_failure_threshold' is used.
+  EOT
+}
+
 variable "secret_configuration" {
  type        = string
  default     = ""