From 9181ba705310d3d27728f5127f5321fe6f19d251 Mon Sep 17 00:00:00 2001 From: Scott Gress Date: Fri, 3 Jan 2025 16:24:01 -0600 Subject: [PATCH] Update Dogfood monitoring to v1.5 (#24425) --- .github/workflows/dogfood-deploy.yml | 5 +- .../dogfood/terraform/aws-tf-module/main.tf | 50 +++++++++++++++---- 2 files changed, 43 insertions(+), 12 deletions(-) diff --git a/.github/workflows/dogfood-deploy.yml b/.github/workflows/dogfood-deploy.yml index bae6a032e97d..f48da405acc5 100644 --- a/.github/workflows/dogfood-deploy.yml +++ b/.github/workflows/dogfood-deploy.yml @@ -4,7 +4,7 @@ on: workflow_dispatch: inputs: DOCKER_IMAGE: - description: 'The full name of the docker image to be deployed. (e.g. fleetdm/fleet:v4.30.0). Note: do not use fleetdm/fleet:main directly. Use the short hash instead. If pull-rate limited, try using the quay.io/fleetdm/fleet mirror.' + description: "The full name of the docker image to be deployed. (e.g. fleetdm/fleet:v4.30.0). Note: do not use fleetdm/fleet:main directly. Use the short hash instead. If pull-rate limited, try using the quay.io/fleetdm/fleet mirror." required: true # This allows a subsequently queued workflow run to interrupt previous runs @@ -26,7 +26,8 @@ env: TF_WORKSPACE: fleet TF_VAR_fleet_image: ${{ github.event.inputs.DOCKER_IMAGE || 'fleetdm/fleet:main' }} TF_VAR_fleet_license: ${{ secrets.DOGFOOD_LICENSE_KEY }} - TF_VAR_slack_webhook: ${{ secrets.SLACK_G_HELP_P1_WEBHOOK_URL }} + TF_VAR_slack_p1_webhook: ${{ secrets.SLACK_G_HELP_P1_WEBHOOK_URL }} + TF_VAR_slack_p2_webhook: ${{ secrets.SLACK_G_HELP_P2_WEBHOOK_URL }} TF_VAR_fleet_sentry_dsn: ${{ secrets.DOGFOOD_SENTRY_DSN }} TF_VAR_elastic_url: ${{ secrets.ELASTIC_APM_SERVER_URL }} TF_VAR_elastic_token: ${{ secrets.ELASTIC_APM_SECRET_TOKEN }} diff --git a/infrastructure/dogfood/terraform/aws-tf-module/main.tf b/infrastructure/dogfood/terraform/aws-tf-module/main.tf index 2e717e692bdd..c7d9c2fa4da6 100644 --- a/infrastructure/dogfood/terraform/aws-tf-module/main.tf +++ b/infrastructure/dogfood/terraform/aws-tf-module/main.tf @@ -369,17 +369,32 @@ module "osquery-carve" { } module "monitoring" { - source = "github.com/fleetdm/fleet//terraform/addons/monitoring?ref=tf-mod-addon-monitoring-v1.1.3" + source = "github.com/fleetdm/fleet//terraform/addons/monitoring?ref=tf-mod-addon-monitoring-v1.5.0" customer_prefix = local.customer - fleet_ecs_service_name = module.main.byo-vpc.byo-db.byo-ecs.service.name - fleet_min_containers = module.main.byo-vpc.byo-db.byo-ecs.service.desired_count - alb_name = module.main.byo-vpc.byo-db.alb.lb_dns_name - alb_target_group_name = module.main.byo-vpc.byo-db.alb.target_group_names[0] - alb_target_group_arn_suffix = module.main.byo-vpc.byo-db.alb.target_group_arn_suffixes[0] - alb_arn_suffix = module.main.byo-vpc.byo-db.alb.lb_arn_suffix + albs = [ + { + name = module.main.byo-vpc.byo-db.alb.lb_dns_name, + target_group_name = module.main.byo-vpc.byo-db.alb.target_group_names[0] + target_group_arn_suffix = module.main.byo-vpc.byo-db.alb.target_group_arn_suffixes[0] + arn_suffix = module.main.byo-vpc.byo-db.alb.lb_arn_suffix + ecs_service_name = module.main.byo-vpc.byo-db.byo-ecs.service.name + min_containers = module.main.byo-vpc.byo-db.byo-ecs.appautoscaling_target.min_capacity + alert_thresholds = { + HTTPCode_ELB_5XX_Count = { + period = 3600 + threshold = 2 + }, + HTTPCode_Target_5XX_Count = { + period = 120 + threshold = 0 + } + } + } + ] sns_topic_arns_map = { alb_httpcode_5xx = [module.notify_slack.slack_topic_arn] cron_monitoring = [module.notify_slack.slack_topic_arn] + cron_job_failure_monitoring = [module.notify_slack_p2.slack_topic_arn] } mysql_cluster_members = module.main.byo-vpc.rds.cluster_members # The cloudposse module seems to have a nested list here. @@ -452,7 +467,11 @@ resource "aws_kms_key" "ecr" { enable_key_rotation = true } -variable "slack_webhook" { +variable "slack_p1_webhook" { + type = string +} + +variable "slack_p2_webhook" { type = string } @@ -460,13 +479,24 @@ module "notify_slack" { source = "terraform-aws-modules/notify-slack/aws" version = "5.5.0" - sns_topic_name = "fleet-dogfood" + sns_topic_name = "fleet-dogfood-p1-alerts" - slack_webhook_url = var.slack_webhook + slack_webhook_url = var.slack_p1_webhook slack_channel = "#help-p1" slack_username = "monitoring" } +module "notify_slack_p2" { + source = "terraform-aws-modules/notify-slack/aws" + version = "5.5.0" + + sns_topic_name = "fleet-dogfood-p2-alerts" + + slack_webhook_url = var.slack_p2_webhook + slack_channel = "#help-p2" + slack_username = "monitoring" +} + module "ses" { source = "github.com/fleetdm/fleet//terraform/addons/ses?ref=tf-mod-addon-ses-v1.0.0" zone_id = aws_route53_zone.main.zone_id