From 82b02e8a6ea3d8365acd5d69a22b9e04cff4d9c4 Mon Sep 17 00:00:00 2001
From: Pat Heard
Date: Thu, 14 Nov 2024 09:17:39 -0500
Subject: [PATCH] feat: add CloudWatch alarm for failed Glue jobs (#23)

Add an EventBridge rule and custom CloudWatch metric to capture
failed Glue jobs. A new alarm has also been added to trigger
when a failure occurs.
---
Review notes:
- The alarm and the published metric now share the same dimensionless
  metric. CloudWatch does not support wildcard dimension values, so an
  alarm filtered on JobName = "*" would never receive datapoints.
- The per-job JobName dimension (published with an empty value) and the
  input_paths that only fed it were removed.

 terragrunt/aws/alarms/alarms.tf      | 23 +++++++++++++++++++--
 terragrunt/aws/alarms/eventbridge.tf | 33 +++++++++++++++++++++++++++++++++
 terragrunt/aws/alarms/locals.tf      |  3 +++
 terragrunt/aws/glue/etl/README.md    |  4 ++++
 4 files changed, 61 insertions(+), 2 deletions(-)
 create mode 100644 terragrunt/aws/alarms/eventbridge.tf
 create mode 100644 terragrunt/aws/glue/etl/README.md

diff --git a/terragrunt/aws/alarms/alarms.tf b/terragrunt/aws/alarms/alarms.tf
index 413de22..9094b25 100644
--- a/terragrunt/aws/alarms/alarms.tf
+++ b/terragrunt/aws/alarms/alarms.tf
@@ -7,8 +7,8 @@ resource "aws_cloudwatch_log_metric_filter" "glue_crawler_error" {
   log_group_name = var.glue_crawler_log_group_name
 
   metric_transformation {
-    name          = "glue-crawler-error"
-    namespace     = "data-lake"
+    name          = local.glue_crawler_error_metric_name
+    namespace     = local.data_lake_namespace
     value         = "1"
     default_value = "0"
     unit          = "Count"
@@ -31,6 +31,25 @@ resource "aws_cloudwatch_metric_alarm" "glue_crawler_error" {
   ok_actions    = [aws_sns_topic.cloudwatch_ok_action.arn]
 }
 
+# Alarm on the aggregate (dimensionless) Glue job failure metric. CloudWatch
+# matches dimension values literally, so a JobName = "*" filter would never
+# receive datapoints and the alarm could not fire.
+resource "aws_cloudwatch_metric_alarm" "glue_job_failures" {
+  alarm_name          = "glue-job-failures"
+  alarm_description   = "Failed Glue jobs in a 1 minute period."
+  comparison_operator = "GreaterThanThreshold"
+  evaluation_periods  = "1"
+  metric_name         = local.glue_job_failure_metric_name
+  namespace           = local.data_lake_namespace
+  period              = "60"
+  statistic           = "Sum"
+  threshold           = "0"
+  treat_missing_data  = "notBreaching"
+
+  alarm_actions = [aws_sns_topic.cloudwatch_alarm_action.arn]
+  ok_actions    = [aws_sns_topic.cloudwatch_ok_action.arn]
+}
+
 #
 # Log Insight queries
 #
diff --git a/terragrunt/aws/alarms/eventbridge.tf b/terragrunt/aws/alarms/eventbridge.tf
new file mode 100644
index 0000000..e72018f
--- /dev/null
+++ b/terragrunt/aws/alarms/eventbridge.tf
@@ -0,0 +1,33 @@
+resource "aws_cloudwatch_event_rule" "glue_job_failure" {
+  name        = "glue-job-failures"
+  description = "Capture Glue job failures and timeouts"
+
+  event_pattern = jsonencode({
+    source      = ["aws.glue"]
+    detail-type = ["Glue Job State Change"]
+    detail = {
+      state = ["FAILED", "TIMEOUT", "ERROR"]
+    }
+  })
+}
+
+resource "aws_cloudwatch_event_target" "glue_job_failure" {
+  rule      = aws_cloudwatch_event_rule.glue_job_failure.name
+  target_id = "PublishMetric"
+  # NOTE(review): confirm the "cloudwatch-metrics" API destination exists and
+  # whether this target needs a role_arn to invoke it.
+  arn       = "arn:aws:events:${var.region}:${var.account_id}:api-destination/cloudwatch-metrics"
+
+  # Published without dimensions so each datapoint lands on the exact metric
+  # that the glue-job-failures alarm evaluates.
+  input_transformer {
+    input_template = jsonencode({
+      MetricData = [{
+        MetricName = local.glue_job_failure_metric_name
+        Value      = 1
+        Unit       = "Count"
+      }]
+      Namespace = local.data_lake_namespace
+    })
+  }
+}
diff --git a/terragrunt/aws/alarms/locals.tf b/terragrunt/aws/alarms/locals.tf
index 98da5d1..154bd09 100644
--- a/terragrunt/aws/alarms/locals.tf
+++ b/terragrunt/aws/alarms/locals.tf
@@ -1,3 +1,6 @@
 locals {
+  data_lake_namespace                      = "data-lake"
   glue_crawler_metric_filter_error_pattern = "ERROR"
+  glue_crawler_error_metric_name           = "glue-crawler-error"
+  glue_job_failure_metric_name             = "glue-job-failure"
 }
\ No newline at end of file
diff --git a/terragrunt/aws/glue/etl/README.md b/terragrunt/aws/glue/etl/README.md
new file mode 100644
index 0000000..13273b5
--- /dev/null
+++ b/terragrunt/aws/glue/etl/README.md
@@ -0,0 +1,4 @@
+# Glue ETL job exports
+There is not currently a way to manage visual Glue ETL jobs with Terraform in a way that allows them to be edited in the console.
+
+The exports in this folder can be used to recreate our ETL jobs using the console or CLI if needed.
\ No newline at end of file