Skip to content

Commit

Permalink
add optional alert config (#36)
Browse files Browse the repository at this point in the history
Remaining stuff I'd like to make configurable:
- alert thresholds
- auto-close timeout

---------

Signed-off-by: Jason Hall <[email protected]>
  • Loading branch information
imjasonh authored Dec 22, 2023
1 parent 1a1de58 commit 2cbbb96
Show file tree
Hide file tree
Showing 7 changed files with 96 additions and 60 deletions.
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ No modules.
| [google_compute_target_https_proxy.prober](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_target_https_proxy) | resource |
| [google_compute_url_map.probers](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_url_map) | resource |
| [google_dns_record_set.prober_dns](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/dns_record_set) | resource |
| [google_monitoring_alert_policy.uptime_alert](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_alert_policy) | resource |
| [google_monitoring_uptime_check_config.global_uptime_check](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_uptime_check_config) | resource |
| [google_monitoring_uptime_check_config.regional_uptime_check](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/monitoring_uptime_check_config) | resource |
| [ko_build.image](https://registry.terraform.io/providers/ko-build/ko/latest/docs/resources/build) | resource |
Expand All @@ -136,14 +137,17 @@ No modules.

| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| <a name="input_alert_description"></a> [alert\_description](#input\_alert\_description) | Alert documentation. Use this to link to playbooks or give additional context. | `string` | `"An uptime check has failed."` | no |
| <a name="input_cpu"></a> [cpu](#input\_cpu) | The CPU limit for the prober. | `string` | `"1000m"` | no |
| <a name="input_dns_zone"></a> [dns\_zone](#input\_dns\_zone) | The managed DNS zone in which to create prober record sets (required for multiple locations). | `string` | `""` | no |
| <a name="input_domain"></a> [domain](#input\_domain) | The domain of the environment to probe (required for multiple locations). | `string` | `""` | no |
| <a name="input_enable_alert"></a> [enable\_alert](#input\_enable\_alert) | If true, alert on failures. Outputs will return the alert ID for notification and dashboards. | `bool` | `false` | no |
| <a name="input_env"></a> [env](#input\_env) | A map of custom environment variables (e.g. key=value) | `map` | `{}` | no |
| <a name="input_importpath"></a> [importpath](#input\_importpath) | The import path that contains the prober application. | `string` | n/a | yes |
| <a name="input_locations"></a> [locations](#input\_locations) | Where to run the Cloud Run services. | `list(string)` | <pre>[<br> "us-central1"<br>]</pre> | no |
| <a name="input_memory"></a> [memory](#input\_memory) | The memory limit for the prober. | `string` | `"512Mi"` | no |
| <a name="input_name"></a> [name](#input\_name) | Name to prefix to created resources. | `any` | n/a | yes |
| <a name="input_name"></a> [name](#input\_name) | Name to prefix to created resources. | `string` | n/a | yes |
| <a name="input_notification_channels"></a> [notification\_channels](#input\_notification\_channels) | A list of notification channels to send alerts to. | `list(string)` | `[]` | no |
| <a name="input_period"></a> [period](#input\_period) | The period for the prober in seconds. | `string` | `"300s"` | no |
| <a name="input_project_id"></a> [project\_id](#input\_project\_id) | The project that will host the prober. | `string` | n/a | yes |
| <a name="input_repository"></a> [repository](#input\_repository) | Container repository to publish images to. | `string` | `""` | no |
Expand All @@ -155,6 +159,7 @@ No modules.

| Name | Description |
|------|-------------|
| <a name="output_alert_id"></a> [alert\_id](#output\_alert\_id) | n/a |
| <a name="output_uptime_check"></a> [uptime\_check](#output\_uptime\_check) | n/a |
| <a name="output_uptime_check_name"></a> [uptime\_check\_name](#output\_uptime\_check\_name) | n/a |
<!-- END_TF_DOCS -->
53 changes: 53 additions & 0 deletions alert.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
Copyright 2023 Chainguard, Inc.
SPDX-License-Identifier: Apache-2.0
*/

// Alert policy driven by this module's uptime check.
// Only instantiated when var.enable_alert is true.
resource "google_monitoring_alert_policy" "uptime_alert" {
  count   = var.enable_alert ? 1 : 0
  project = var.project_id

  display_name = "${local.uptime_check_name} prober failed alert"
  enabled      = true
  combiner     = "OR"

  // Channels that receive notifications for this policy.
  notification_channels = var.notification_channels

  // Caller-supplied text attached to the alert (e.g. a playbook link).
  documentation {
    content = var.alert_description
  }

  // In the absence of data, incident will auto-close in 7 days.
  alert_strategy {
    auto_close = "${7 * 24 * 60 * 60}s"
  }

  conditions {
    display_name = "${local.uptime_check_name} probe failure"

    condition_threshold {
      // Restrict the condition to the check_passed series of this module's
      // uptime check.
      filter = <<-EOT
        metric.type="monitoring.googleapis.com/uptime_check/check_passed"
        resource.type="uptime_url"
        metric.label.check_id="${local.uptime_check_id}"
      EOT

      aggregations {
        alignment_period     = "300s"
        per_series_aligner   = "ALIGN_NEXT_OLDER"
        cross_series_reducer = "REDUCE_COUNT_FALSE"
        group_by_fields      = ["resource.*"]
      }

      comparison = "COMPARISON_GT"
      duration   = "60s"

      // TODO(jason): Make these configurable.
      threshold_value = 1
      trigger {
        count   = 1
        percent = 0
      }
    }
  }
}
49 changes: 2 additions & 47 deletions examples/basic/example.tf
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ variable "project_id" {
}

resource "google_service_account" "prober" {
project = var.project_id
project = var.project_id
account_id = "basic-example-prober"
}

Expand All @@ -37,51 +37,6 @@ module "prober" {
env = {
EXAMPLE_ENV = "honk"
}
}

// Create an alert policy based on the uptime check above.
resource "google_monitoring_alert_policy" "prober_uptime" {
project = var.project_id
# In the absence of data, incident will auto-close in 7 days
alert_strategy {
auto_close = "604800s"
}
combiner = "OR"

conditions {
condition_threshold {
aggregations {
alignment_period = "300s"
cross_series_reducer = "REDUCE_COUNT_FALSE"
group_by_fields = ["resource.*"]
per_series_aligner = "ALIGN_NEXT_OLDER"
}

comparison = "COMPARISON_GT"
duration = "60s"
filter = format("metric.type=\"monitoring.googleapis.com/uptime_check/check_passed\" resource.type=\"uptime_url\" metric.label.\"check_id\"=\"%s\"", module.prober.uptime_check)

threshold_value = "1"

trigger {
count = "1"
percent = "0"
}
}

display_name = "${module.prober.uptime_check_name} probe failure"
}

display_name = "${module.prober.uptime_check_name} prober failed alert"
enabled = "true"

documentation {
content = <<-EOT
< Add your documentation or link to a playbook here >
EOT
}

depends_on = [
module.prober
]
enable_alert = true
}
3 changes: 2 additions & 1 deletion examples/complex/example.tf
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ resource "google_dns_managed_zone" "prober_zone" {
}

resource "google_service_account" "prober" {
project = var.project_id
project = var.project_id
account_id = "complex-example-prober"
}

Expand All @@ -42,6 +42,7 @@ module "prober" {
name = "complex-example"
project_id = var.project_id
service-account = google_service_account.prober.email
enable_alert = true

importpath = "github.com/chainguard-dev/terraform-google-prober/examples/complex"
working_dir = path.module
Expand Down
12 changes: 3 additions & 9 deletions main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,9 @@ SPDX-License-Identifier: Apache-2.0

terraform {
required_providers {
cosign = {
source = "chainguard-dev/cosign"
}
ko = {
source = "ko-build/ko"
}
google = {
source = "hashicorp/google"
}
cosign = { source = "chainguard-dev/cosign" }
ko = { source = "ko-build/ko" }
google = { source = "hashicorp/google" }
}
}

Expand Down
13 changes: 11 additions & 2 deletions outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,19 @@ Copyright 2022 Chainguard, Inc.
SPDX-License-Identifier: Apache-2.0
*/

// Name and ID of whichever uptime check is in effect: the global check when
// local.use_gclb is true, otherwise the regional check.
locals {
  uptime_check_name = (
    local.use_gclb
    ? google_monitoring_uptime_check_config.global_uptime_check[0].display_name
    : google_monitoring_uptime_check_config.regional_uptime_check[0].display_name
  )

  uptime_check_id = (
    local.use_gclb
    ? google_monitoring_uptime_check_config.global_uptime_check[0].uptime_check_id
    : google_monitoring_uptime_check_config.regional_uptime_check[0].uptime_check_id
  )
}

output "uptime_check" {
value = local.use_gclb ? google_monitoring_uptime_check_config.global_uptime_check[0].uptime_check_id : google_monitoring_uptime_check_config.regional_uptime_check[0].uptime_check_id
value = local.uptime_check_id
}

output "uptime_check_name" {
value = local.use_gclb ? google_monitoring_uptime_check_config.global_uptime_check[0].display_name : google_monitoring_uptime_check_config.regional_uptime_check[0].display_name
value = local.uptime_check_name
}

// Exposes the alert policy created when var.enable_alert is true so callers
// can reference it; falls back to an empty string when no policy exists.
output "alert_id" {
  description = "ID of the uptime alert policy, or an empty string when enable_alert is false."
  value       = var.enable_alert ? google_monitoring_alert_policy.uptime_alert[0].id : ""
}
19 changes: 19 additions & 0 deletions variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ SPDX-License-Identifier: Apache-2.0
*/

// Required: no default is provided, so callers must set it.
variable "name" {
  description = "Name to prefix to created resources."
  type        = string
}

Expand Down Expand Up @@ -79,3 +80,21 @@ variable "memory" {
default = "512Mi"
description = "The memory limit for the prober."
}

// Opt-in switch for alerting; off unless the caller sets it to true.
variable "enable_alert" {
  description = "If true, alert on failures. Outputs will return the alert ID for notification and dashboards."
  type        = bool
  default     = false
}

// Documentation text for the alert; a generic message is used when unset.
variable "alert_description" {
  description = "Alert documentation. Use this to link to playbooks or give additional context."
  type        = string
  default     = "An uptime check has failed."
}

// Notification targets for alerts; defaults to an empty list.
variable "notification_channels" {
  description = "A list of notification channels to send alerts to."
  type        = list(string)
  default     = []
}

0 comments on commit 2cbbb96

Please sign in to comment.