FAQ | This is a LIVE service | Changelog

Skip to content
Snippets Groups Projects
Commit b7fec48b authored by Monty Dawson's avatar Monty Dawson :coffee:
Browse files

feat(alerting): Add failure alerts to catch explicit failures during invocation.

parent 825f1472
No related branches found
No related tags found
1 merge request: !11 "feat(alerting): Add failure alerts to catch explicit failures during invocation."
Pipeline #533970 passed
......@@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [2.4.0] - 2024-05-29
### Added
- Added support for explicit failure alerts, enabled via the `failure_alert_enabled` variable and configurable
via the `alert_failure_threshold` and `alert_failure_period` variables.
## [2.3.1] - 2024-02-05
### Added
......
......@@ -259,3 +259,72 @@ resource "google_monitoring_alert_policy" "successes" {
provider = google.monitoring
}
# Log-based metric counting failed invocations of the script: it matches log
# entries of this module's Cloud Scheduler job whose HTTP status is 400 or above.
resource "google_logging_metric" "failures" {
  name    = "script/failures-${local.kebab_name}"
  project = local.project

  # Each matching log entry contributes to an integer DELTA series.
  metric_descriptor {
    value_type  = "INT64"
    metric_kind = "DELTA"
  }

  # The filter is written one clause per line for readability; the newlines are
  # then replaced with spaces and the result trimmed to give a one-line filter.
  filter = trimspace(replace(
    <<-EOI
    resource.type="cloud_scheduler_job"
    AND resource.labels.project_id = "${local.project}"
    AND resource.labels.job_id = "${google_cloud_scheduler_job.script.name}"
    AND httpRequest.status>=400
    EOI
  , "\n", " "))
}
# Alert policy raised when failed invocations of the script are detected via the
# "failures" log-based metric. Created through the monitoring provider alias in
# the monitoring project, and only active when var.failure_alert_enabled is set.
resource "google_monitoring_alert_policy" "failures" {
  provider = google.monitoring
  project  = local.monitoring_project

  # The workspace name is part of the title so alerts are easier to triage.
  display_name          = "Invocations of ${var.name} script failed (${terraform.workspace})"
  enabled               = var.failure_alert_enabled
  combiner              = "OR"
  notification_channels = local.notification_channels

  conditions {
    # Same text as the policy title, again including the workspace name.
    display_name = "Invocations of ${var.name} script failed (${terraform.workspace})"

    condition_threshold {
      # Select the failures log metric for Cloud Scheduler jobs in the project
      # which hosts the script. Clauses are written one per line and then
      # collapsed into a single-line filter.
      filter = trimspace(replace(
        <<-EOI
        metric.type="logging.googleapis.com/user/${google_logging_metric.failures.id}"
        AND resource.type="cloud_scheduler_job"
        AND resource.label.project_id="${local.project}"
        EOI
        , "\n", " "
      ))

      # Sum the failure count over each alignment period and across all series,
      # giving the total number of failures in that period.
      aggregations {
        alignment_period     = var.alert_failure_period
        per_series_aligner   = "ALIGN_SUM"
        cross_series_reducer = "REDUCE_SUM"
      }

      # The condition is met when the summed count is strictly greater than the
      # threshold (COMPARISON_GT), not when it is merely equal to it.
      comparison      = "COMPARISON_GT"
      threshold_value = var.alert_failure_threshold

      # How long the threshold must remain breached before the condition fires.
      duration = "60s"
    }
  }
}
\ No newline at end of file
......@@ -150,6 +150,32 @@ variable "alert_enabled" {
EOT
}
# Opt-in switch for the "failures" alert policy; off unless explicitly enabled.
variable "failure_alert_enabled" {
  description = <<-EOT
    Flag indicating if failure alerts should be enabled for this script, separate
    to the default alert which is raised if a successful invocation has not been
    detected within the alerting period.
  EOT

  type    = bool
  default = false
}
# Threshold for the "failures" alert policy. The policy compares the summed
# failure count with COMPARISON_GT, so the count has to be strictly greater than
# this value for the alert to fire.
variable "alert_failure_threshold" {
  type        = number
  default     = 1
  description = <<-EOT
    The number of failures within 'alert_failure_period' which must be exceeded
    for a failure alert to be triggered.
  EOT
}
# Window over which failures are summed by the "failures" alert policy; used as
# the alignment period of its aggregation. Defaults to "3600s".
variable "alert_failure_period" {
  description = <<-EOT
    Period over which 'alert_failure_threshold' is used.
  EOT

  type    = string
  default = "3600s"
}
variable "secret_configuration" {
type = string
default = ""
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment