Skip to content

Commit

Permalink
[EC-357] Migrate core monitoring folder to new TF config (#1148)
Browse files Browse the repository at this point in the history
  • Loading branch information
Krusty93 authored Sep 4, 2024
1 parent f285f2f commit cc6c3cf
Show file tree
Hide file tree
Showing 13 changed files with 849 additions and 457 deletions.
15 changes: 12 additions & 3 deletions src/common/_modules/global/modules/dns/outputs.tf
Original file line number Diff line number Diff line change
@@ -1,19 +1,28 @@
output "public_dns_zones" {
value = {
io = {
name = azurerm_dns_zone.io_pagopa_it.name
name = azurerm_dns_zone.io_pagopa_it.name
api = trimsuffix(azurerm_dns_a_record.api_io_pagopa_it.fqdn, ".")
api_app = trimsuffix(azurerm_dns_a_record.api_app_io_pagopa_it.fqdn, ".")
api_web = trimsuffix(azurerm_dns_a_record.api_web_io_pagopa_it.fqdn, ".")
api_mtls = trimsuffix(azurerm_dns_a_record.api_mtls_io_pagopa_it.fqdn, ".")
continua = trimsuffix(azurerm_dns_a_record.continua_io_pagopa_it.fqdn, ".")
}

firmaconio_selfcare_pagopa_it = {
name = azurerm_dns_zone.io_pagopa_it.name
}

io_italia_it = {
name = azurerm_dns_zone.io_italia_it.name
name = azurerm_dns_zone.io_italia_it.name
developer_portal_backend = trimsuffix(azurerm_dns_a_record.developerportal_backend_io_italia_it.fqdn, ".")
api = trimsuffix(azurerm_dns_a_record.api_io_italia_it.fqdn, ".")
app_backend = trimsuffix(azurerm_dns_a_record.app_backend_io_italia_it.fqdn, ".")
}

io_selfcare_pagopa_it = {
name = azurerm_dns_zone.io_pagopa_it.name
api = trimsuffix(azurerm_dns_a_record.api_io_selfcare_pagopa_it.fqdn, ".")
}

ioweb_it = {
Expand All @@ -26,4 +35,4 @@ output "private_dns_zones" {
value = {
servicebus = azurerm_private_dns_zone.privatelink_servicebus
}
}
}
82 changes: 82 additions & 0 deletions src/common/_modules/monitoring/ag.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Action group targeted by error-level alerts in this module. It notifies two
# e-mail receivers ("email" and "slack") and one webhook receiver
# ("sendtoopsgenie"); every destination is read from a Key Vault secret.
resource "azurerm_monitor_action_group" "error" {
  resource_group_name = var.resource_group_common
  name                = try(local.nonstandard[var.location_short].ag_error, "${var.project}-error-ag-01")

  # Azure limits an action group short name to 12 characters, so the fallback
  # uses the compact, dash-free project prefix rather than the full resource
  # name (which would be rejected for locations without a nonstandard entry).
  short_name = try(local.nonstandard[var.location_short].ag_error, "${local.ag_formatted_project}error")

  email_receiver {
    name                    = "email"
    email_address           = data.azurerm_key_vault_secret.alert_error_notification_email.value
    use_common_alert_schema = true
  }

  email_receiver {
    name                    = "slack"
    email_address           = data.azurerm_key_vault_secret.alert_error_notification_slack.value
    use_common_alert_schema = true
  }

  webhook_receiver {
    name                    = "sendtoopsgenie"
    service_uri             = data.azurerm_key_vault_secret.alert_error_notification_opsgenie.value
    use_common_alert_schema = true
  }

  tags = var.tags
}

# Action group for quarantine errors. It has a single e-mail receiver named
# "slack" whose address is read from a Key Vault secret.
resource "azurerm_monitor_action_group" "quarantine_error" {
  resource_group_name = var.resource_group_common
  name                = try(local.nonstandard[var.location_short].ag_quarantine_error, "${var.project}-quarantineerror-ag-01")

  # Azure limits an action group short name to 12 characters, so the fallback
  # follows the same compact, dash-free pattern used by the nonstandard names.
  short_name = try(local.nonstandard[var.location_short].ag_quarantine_error_short, "${local.ag_formatted_project}qerr")

  email_receiver {
    name                    = "slack"
    email_address           = data.azurerm_key_vault_secret.alert_quarantine_error_notification_slack.value
    use_common_alert_schema = true
  }

  tags = var.tags
}

# Action group that publishes to the channel of the trial-system project.
# It has a single e-mail receiver named "slack" whose address is read from a
# Key Vault secret.
resource "azurerm_monitor_action_group" "trial_system_error" {
  resource_group_name = var.resource_group_common
  name                = try(local.nonstandard[var.location_short].ag_ts_error, "${var.project}-ts-error-ag-01")

  # Azure limits an action group short name to 12 characters, so the fallback
  # follows the same compact, dash-free pattern used by the nonstandard names.
  short_name = try(local.nonstandard[var.location_short].ag_ts_error_short, "${local.ag_formatted_project}tserr")

  email_receiver {
    name                    = "slack"
    email_address           = data.azurerm_key_vault_secret.alert_error_trial_slack.value
    use_common_alert_schema = true
  }

  tags = var.tags
}

# Action group with a single e-mail receiver named "sendtooperations"; the
# address is read from the "monitor-notification-email" Key Vault secret.
resource "azurerm_monitor_action_group" "email" {
  name                = try(local.nonstandard[var.location_short].email_pagopa, "${var.project}-email-ag-01")
  resource_group_name = var.resource_group_common

  # Azure limits an action group short name to 12 characters, so the fallback
  # uses the compact, dash-free project prefix rather than the full resource
  # name.
  short_name = try(local.nonstandard[var.location_short].email_pagopa, "${local.ag_formatted_project}email")

  email_receiver {
    name                    = "sendtooperations"
    email_address           = data.azurerm_key_vault_secret.monitor_notification_email.value
    use_common_alert_schema = true
  }

  tags = var.tags
}

# Action group with a single e-mail receiver named "sendtoslack"; the address
# is read from the "monitor-notification-slack-email" Key Vault secret.
resource "azurerm_monitor_action_group" "slack" {
  name                = try(local.nonstandard[var.location_short].slack_pagopa, "${var.project}-slack-ag-01")
  resource_group_name = var.resource_group_common

  # Azure limits an action group short name to 12 characters, so the fallback
  # uses the compact, dash-free project prefix rather than the full resource
  # name.
  short_name = try(local.nonstandard[var.location_short].slack_pagopa, "${local.ag_formatted_project}slack")

  email_receiver {
    name                    = "sendtoslack"
    email_address           = data.azurerm_key_vault_secret.monitor_notification_slack_email.value
    use_common_alert_schema = true
  }

  tags = var.tags
}
11 changes: 11 additions & 0 deletions src/common/_modules/monitoring/appi.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Application Insights component attached to the module's Log Analytics
# workspace. The name comes from local.nonstandard when the location has an
# entry there, otherwise it is "<project>-appi-01".
resource "azurerm_application_insights" "appi" {
  name                = try(local.nonstandard[var.location_short].appi, "${var.project}-appi-01")
  resource_group_name = var.resource_group_common
  location            = var.location

  application_type   = "other"
  disable_ip_masking = true
  workspace_id       = azurerm_log_analytics_workspace.log.id

  tags = var.tags
}
35 changes: 35 additions & 0 deletions src/common/_modules/monitoring/data.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Secret "monitor-notification-slack-email" from the Key Vault given by
# var.kv_id; its value is the receiver address of the "slack" action group.
data "azurerm_key_vault_secret" "monitor_notification_slack_email" {
  key_vault_id = var.kv_id
  name         = "monitor-notification-slack-email"
}

# Secret "monitor-notification-email" from the Key Vault given by var.kv_id;
# its value is the receiver address of the "email" action group.
data "azurerm_key_vault_secret" "monitor_notification_email" {
  key_vault_id = var.kv_id
  name         = "monitor-notification-email"
}

# Secret "alert-error-notification-email" from the Key Vault given by
# var.kv_id; used as the "email" receiver address of the "error" action group.
data "azurerm_key_vault_secret" "alert_error_notification_email" {
  key_vault_id = var.kv_id
  name         = "alert-error-notification-email"
}

# Secret "alert-error-notification-slack" from the Key Vault given by
# var.kv_id; used as the "slack" receiver address of the "error" action group.
data "azurerm_key_vault_secret" "alert_error_notification_slack" {
  key_vault_id = var.kv_id
  name         = "alert-error-notification-slack"
}

# Secret "alert-error-quarantine-notification-slack" from the Key Vault given
# by var.kv_id; used as the receiver address of the "quarantine_error" action
# group. Note that the secret name orders the words differently from the data
# source label.
data "azurerm_key_vault_secret" "alert_quarantine_error_notification_slack" {
  key_vault_id = var.kv_id
  name         = "alert-error-quarantine-notification-slack"
}

# Secret "alert-error-notification-opsgenie" from the Key Vault given by
# var.kv_id; used as the webhook service URI of the "error" action group.
data "azurerm_key_vault_secret" "alert_error_notification_opsgenie" {
  key_vault_id = var.kv_id
  name         = "alert-error-notification-opsgenie"
}

# Secret "alert-error-trial-slack" from the Key Vault given by var.kv_id.
# Points to the channel of the trial-system project; used as the receiver
# address of the "trial_system_error" action group.
data "azurerm_key_vault_secret" "alert_error_trial_slack" {
  key_vault_id = var.kv_id
  name         = "alert-error-trial-slack"
}
22 changes: 22 additions & 0 deletions src/common/_modules/monitoring/locals.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
locals {
  # var.project with every dash removed; prefix of the compact action group
  # names defined below.
  ag_formatted_project = replace(var.project, "-", "")

  # Resource name overrides keyed by short location. Resources look these up
  # through try(...) and fall back to a computed name when the location (or the
  # key) is absent.
  nonstandard = {
    weu = {
      # Log Analytics workspace and Application Insights
      log  = "${var.project}-law-common"
      appi = "${var.project}-ai-common"

      # "error" action group (same value is used for name and short name)
      ag_error = "${local.ag_formatted_project}error"

      # "quarantine_error" action group
      ag_quarantine_error       = "${local.ag_formatted_project}quarantineerror"
      ag_quarantine_error_short = "${local.ag_formatted_project}qerr"

      # "trial_system_error" action group
      ag_ts_error       = "${local.ag_formatted_project}trialsystemerror"
      ag_ts_error_short = "${local.ag_formatted_project}tserr"

      # "email" and "slack" action groups
      email_pagopa = "EmailPagoPA"
      slack_pagopa = "SlackPagoPA"
    }
  }
}
10 changes: 10 additions & 0 deletions src/common/_modules/monitoring/log.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Log Analytics workspace that backs the module's Application Insights
# component. The name comes from local.nonstandard when the location has an
# entry there, otherwise it is "<project>-log-01".
resource "azurerm_log_analytics_workspace" "log" {
  name                = try(local.nonstandard[var.location_short].log, "${var.project}-log-01")
  location            = var.location
  resource_group_name = var.resource_group_common
  sku                 = "PerGB2018"

  # Both arguments are numeric in the provider schema; use number literals
  # instead of relying on implicit string-to-number conversion.
  retention_in_days = 90
  daily_quota_gb    = -1 # -1 disables the daily ingestion cap

  tags = var.tags
}
43 changes: 43 additions & 0 deletions src/common/_modules/monitoring/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Naming prefix interpolated into every resource name of this module.
variable "project" {
  description = "IO prefix, short environment and short location"
  type        = string
}

# Region passed to the workspace and Application Insights resources.
variable "location" {
  description = "Azure region"
  type        = string
}

# Key used to look up name overrides in local.nonstandard.
variable "location_short" {
  description = "Azure region short name"
  type        = string
}

# Tags forwarded unchanged to the resources that set `tags = var.tags`.
variable "tags" {
  description = "Resource tags"
  type        = map(any)
}

# Resource group that hosts the monitoring resources of this module.
# NOTE(review): the default is null, yet the value feeds resource_group_name
# on several resources in this module; callers presumably always set it.
# Confirm, and consider dropping the default.
variable "resource_group_common" {
  description = "Name of common resource group"
  type        = string
  default     = null
}

# Key Vault from which all notification secrets of this module are read.
variable "kv_id" {
  description = "Id of the IO KeyVault"
  type        = string
}

# Endpoints to probe. Each entry yields one standard web test and one metric
# alert, both keyed by `name`, so names must be unique within the list.
variable "test_urls" {
  type = list(object({
    name                              = string # unique key, also used in resource names
    host                              = string # host part of the probed URL (always requested over https)
    path                              = string # path appended to the host
    frequency                         = number # passed to the web test `frequency`
    http_status                       = number # expected HTTP status code
    ssl_cert_remaining_lifetime_check = number # used only when ssl_enabled is true
    enabled                           = optional(bool, true)
    ssl_enabled                       = optional(bool, true)
  }))
  description = "List of web availability tests to create, one standard web test and one metric alert per entry"
}
97 changes: 97 additions & 0 deletions src/common/_modules/monitoring/web_tests.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# One standard availability test per entry of var.test_urls, keyed by the
# entry name and attached to the module's Application Insights component.
resource "azurerm_application_insights_standard_web_test" "web_tests" {
  for_each = { for v in var.test_urls : v.name => v if v != null }

  # each.key is the entry name (the map above is keyed by v.name)
  name                    = "${each.key}-test-${azurerm_application_insights.appi.name}"
  application_insights_id = azurerm_application_insights.appi.id
  resource_group_name     = azurerm_application_insights.appi.resource_group_name
  location                = azurerm_application_insights.appi.location

  enabled       = each.value.enabled
  frequency     = each.value.frequency
  retry_enabled = true

  # https://learn.microsoft.com/en-us/previous-versions/azure/azure-monitor/app/monitor-web-app-availability#location-population-tags
  geo_locations = ["emea-nl-ams-azr"]

  request {
    url                              = "https://${each.value.host}${each.value.path}"
    follow_redirects_enabled         = false
    parse_dependent_requests_enabled = false
  }

  validation_rules {
    expected_status_code = each.value.http_status
    ssl_check_enabled    = each.value.ssl_enabled
    # The certificate lifetime check is only set when the SSL check is enabled
    ssl_cert_remaining_lifetime = each.value.ssl_enabled ? each.value.ssl_cert_remaining_lifetime_check : null
  }
}

# One severity-1 metric alert per entry of var.test_urls. It is bound to the
# matching web test, triggers when at least one location fails, and notifies
# the "error" action group.
resource "azurerm_monitor_metric_alert" "metric_alerts" {
  for_each = { for v in var.test_urls : v.name => v if v != null }

  # each.key is the entry name (the map above is keyed by v.name)
  name                = "${each.key}-test-${azurerm_application_insights.appi.name}"
  resource_group_name = azurerm_application_insights.appi.resource_group_name
  description         = "Web availability check alert triggered when it fails. Runbook: https://pagopa.atlassian.net/wiki/spaces/IC/pages/762347521/Web+Availability+Test+-+TLS+Probe+Check"

  severity      = 1
  auto_mitigate = false

  scopes = [
    azurerm_application_insights.appi.id,
    azurerm_application_insights_standard_web_test.web_tests[each.key].id,
  ]

  application_insights_web_test_location_availability_criteria {
    component_id          = azurerm_application_insights.appi.id
    web_test_id           = azurerm_application_insights_standard_web_test.web_tests[each.key].id
    failed_location_count = 1
  }

  action {
    action_group_id = azurerm_monitor_action_group.error.id
  }
}

# Severity-1 log alert on failed, non-browser dependency calls whose target
# contains "send.mailup.com". The query runs every 5 minutes over a 30-minute
# window and the alert triggers when the row count is greater than 10; it
# notifies the "error" action group.
resource "azurerm_monitor_scheduled_query_rules_alert_v2" "mailup" {
  name         = "[SEND.MAILUP.COM] Many Failures"
  display_name = "[SEND.MAILUP.COM] Many Failures"
  description  = "Check in Application Insight - Dependencies the mailup calls. Runbook: https://pagopa.atlassian.net/wiki/spaces/IC/pages/777650829/MailUp+Communication+Failures"

  resource_group_name = azurerm_application_insights.appi.resource_group_name
  location            = azurerm_application_insights.appi.location
  scopes              = [azurerm_application_insights.appi.id]

  enabled                 = true
  severity                = 1
  auto_mitigation_enabled = false

  evaluation_frequency = "PT5M"
  window_duration      = "PT30M"

  criteria {
    time_aggregation_method = "Count"
    operator                = "GreaterThan"
    threshold               = 10

    query = <<-QUERY
      let timeGrain=5m;
      let dataset=dependencies
      // additional filters can be applied here
      | where client_Type != "Browser"
      | where target contains "send.mailup.com"
      | where success == false;
      dataset
    QUERY

    # A single failing evaluation period is enough to trigger the alert
    failing_periods {
      number_of_evaluation_periods             = 1
      minimum_failing_periods_to_trigger_alert = 1
    }
  }

  action {
    action_groups = [azurerm_monitor_action_group.error.id]
  }

  tags = var.tags
}
1 change: 1 addition & 0 deletions src/common/prod/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
|------|--------|---------|
| <a name="module_event_hubs_weu"></a> [event\_hubs\_weu](#module\_event\_hubs\_weu) | ../_modules/event_hubs | n/a |
| <a name="module_global"></a> [global](#module\_global) | ../_modules/global | n/a |
| <a name="module_monitoring_weu"></a> [monitoring\_weu](#module\_monitoring\_weu) | ../_modules/monitoring | n/a |

## Resources

Expand Down
Loading

0 comments on commit cc6c3cf

Please sign in to comment.