Skip to content

Commit

Permalink
feat: add CloudWatch alarm for failed Glue jobs (#23)
Browse files Browse the repository at this point in the history
Add an EventBridge rule and custom CloudWatch metric to capture
failed Glue jobs.  A new alarm has also been added to trigger when a
failure occurs.
  • Loading branch information
patheard authored Nov 14, 2024
1 parent 4f97f30 commit 82b02e8
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 2 deletions.
24 changes: 22 additions & 2 deletions terragrunt/aws/alarms/alarms.tf
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ resource "aws_cloudwatch_log_metric_filter" "glue_crawler_error" {
log_group_name = var.glue_crawler_log_group_name

metric_transformation {
name = "glue-crawler-error"
namespace = "data-lake"
name = local.glue_crawler_error_metric_name
namespace = local.data_lake_namespace
value = "1"
default_value = "0"
unit = "Count"
Expand All @@ -31,6 +31,26 @@ resource "aws_cloudwatch_metric_alarm" "glue_crawler_error" {
ok_actions = [aws_sns_topic.cloudwatch_ok_action.arn]
}

# Alarm that fires when one or more Glue job failures are recorded in a
# 1 minute period, regardless of which job failed.
#
# CloudWatch treats every distinct dimension combination as its own metric and
# alarms do not support wildcard dimension values, so a `JobName = "*"`
# dimension would only ever match a job literally named "*".  A Metrics
# Insights query is used instead to sum the failure metric across every
# `JobName` value in the namespace.
resource "aws_cloudwatch_metric_alarm" "glue_job_failures" {
  alarm_name          = "glue-job-failures"
  alarm_description   = "Failed Glue jobs in a 1 minute period."
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = "1"
  threshold           = "0"

  # No failures means no datapoints are published; treat that as healthy.
  treat_missing_data = "notBreaching"

  alarm_actions = [aws_sns_topic.cloudwatch_alarm_action.arn]
  ok_actions    = [aws_sns_topic.cloudwatch_ok_action.arn]

  metric_query {
    id          = "failures"
    label       = "Failed Glue jobs (all job names)"
    period      = 60
    return_data = true

    # Metric and namespace names contain hyphens, so they must be wrapped in
    # double quotes inside the Metrics Insights query.
    expression = "SELECT SUM(\"${local.glue_job_failure_metric_name}\") FROM SCHEMA(\"${local.data_lake_namespace}\", JobName)"
  }
}

#
# Log Insight queries
#
Expand Down
37 changes: 37 additions & 0 deletions terragrunt/aws/alarms/eventbridge.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# EventBridge rule that matches Glue job state change events whose state is
# one of FAILED, TIMEOUT or ERROR.
resource "aws_cloudwatch_event_rule" "glue_job_failure" {
  description = "Capture Glue job failures and timeouts"
  name        = "glue-job-failures"

  # jsonencode emits object keys in sorted order, so the key order used here
  # has no effect on the rendered pattern.
  event_pattern = jsonencode({
    "detail" = {
      "state" = ["FAILED", "TIMEOUT", "ERROR"]
    }
    "detail-type" = ["Glue Job State Change"]
    "source"      = ["aws.glue"]
  })
}

# Target for the Glue job failure rule.  Each matched event is transformed
# into a metric payload carrying a single `Count` datapoint of 1, dimensioned
# by the name of the job that failed.
resource "aws_cloudwatch_event_target" "glue_job_failure" {
  rule      = aws_cloudwatch_event_rule.glue_job_failure.name
  target_id = "PublishMetric"

  # NOTE(review): API destination ARNs normally end with a generated ID
  # (api-destination/<name>/<id>) and API destination targets require a
  # `role_arn`.  Confirm this destination exists and that EventBridge is
  # allowed to invoke it; otherwise no metric will be published.
  arn = "arn:aws:events:${var.region}:${var.account_id}:api-destination/cloudwatch-metrics"

  input_transformer {
    input_paths = {
      jobName = "$.detail.jobName"
      state   = "$.detail.state"
    }

    # Written as a heredoc rather than jsonencode(): jsonencode escapes "<" and
    # ">" to \u003c / \u003e, which would stop EventBridge from recognising the
    # <jobName> placeholder and substituting the real job name.
    input_template = <<-EOT
      {
        "MetricData": [
          {
            "Dimensions": [
              {
                "Name": "JobName",
                "Value": "<jobName>"
              }
            ],
            "MetricName": "${local.glue_job_failure_metric_name}",
            "Unit": "Count",
            "Value": 1
          }
        ],
        "Namespace": "${local.data_lake_namespace}"
      }
    EOT
  }
}
3 changes: 3 additions & 0 deletions terragrunt/aws/alarms/locals.tf
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Shared names used by the metric filters, EventBridge target and alarms in
# this module so that publishers and alarms always reference the same metric.
locals {
  # CloudWatch namespace under which the custom metrics are published.
  data_lake_namespace = "data-lake"

  # Glue crawler error metric and the log pattern that feeds it.
  glue_crawler_error_metric_name           = "glue-crawler-error"
  glue_crawler_metric_filter_error_pattern = "ERROR"

  # Custom metric recording failed Glue job runs.
  glue_job_failure_metric_name = "glue-job-failure"
}
4 changes: 4 additions & 0 deletions terragrunt/aws/glue/etl/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Glue ETL job exports
There is currently no way to manage visual Glue ETL jobs with Terraform while keeping them editable in the console.

The exports in this folder can be used to recreate our ETL jobs using the console or CLI if needed.

0 comments on commit 82b02e8

Please sign in to comment.