From ba72432b3f1a0b2aba2217b508084a14e6d4ac75 Mon Sep 17 00:00:00 2001 From: Hamza Jugon <104994559+HamzaJugon@users.noreply.github.com> Date: Fri, 10 Jan 2025 12:45:30 +0000 Subject: [PATCH] Feature/splunk configuration (#292) --- CHANGELOG.md | 4 +++ VARIABLES.md | 46 ++++++++++++++++------------- templates.tf | 38 ++++++++++++++++-------- templates/apiary-hms-readonly.json | 25 ++++++++++++++-- templates/apiary-hms-readwrite.json | 23 +++++++++++++-- variables.tf | 25 ++++++++++++++++ 6 files changed, 122 insertions(+), 39 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cfc00b9..14de3fa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). +## [7.9.0] - 2025-01-09 +### Added +- Enabled Splunk for log forwarding and implemented health checks for ECS HMS. + ## [7.8.0] - 2024-12-12 ### Added - Hive databases backed by S3 can now have versioning enabled. diff --git a/VARIABLES.md b/VARIABLES.md index bb95ff2..ec44648 100644 --- a/VARIABLES.md +++ b/VARIABLES.md @@ -21,8 +21,8 @@ | apiary\_producer\_iamroles | AWS IAM roles allowed write access to managed Apiary S3 buckets. | `map(any)` | `{}` | no | | apiary\_rds\_additional\_sg | Comma-separated string containing additional security groups to attach to RDS. | `list(any)` | `[]` | no | | apiary\_shared\_schemas | Schema names which are accessible from read-only metastore, default is all schemas. | `list(any)` | `[]` | no | -| apiary\_tags | Common tags that are added to all resources. | `map(any)` | n/a | yes | -| apiary\_extra\_tags\_s3 | Extra tags that are added to apiary_s3_logs_bucket. | `map(any)` | n/a | no | +| apiary\_tags | Common tags that are added to all resources. 
| `map(any)` | n/a | yes | +| apiary\_extra\_tags\_s3 | Extra tags that are added to apiary_s3_logs_bucket. | `map(any)` | n/a | no | | atlas\_cluster\_name | Name of the Atlas cluster where metastore plugin will send DDL events. Defaults to `var.instance_name` if not set. | `string` | `""` | no | | atlas\_kafka\_bootstrap\_servers | Kafka instance url. | `string` | `""` | no | | aws\_region | AWS region. | `string` | n/a | yes | @@ -123,25 +123,29 @@ | system\_schema\_customer\_accounts | AWS account IDs allowed to access system database. | `list(string)` | `[]` | no | | system\_schema\_name | Name for the internal system database | `string` | `"apiary_system"` | no | | table\_param\_filter | A regular expression for selecting necessary table parameters for the SNS listener. If the value isn't set, then no table parameters are selected. | `string` | `""` | no | -| vpc\_id | VPC ID. | `string` | n/a | yes | -| enable\_dashboard | make EKS & ECS dashboard optional | `bool` | true | no | -| rds\_family | RDS Family | `string` | aurora5.6 | no | -| datadog\_metrics\_enabled | Enable Datadog metrics for HMS | `bool` | false | no | -| datadog\_metrics\_hms\_readwrite\_readonly | Prometheus Metrics sent to datadog | list(string) | ["metrics_classloading_loaded_value","metrics_threads_count_value","metrics_memory_heap_max_value","metrics_init_total_count_tables_value","metrics_init_total_count_dbs_value","metrics_memory_heap_used_value","metrics_init_total_count_partitions_value"] | no | -| datadog_metrics_port | Port in which metrics will be send for Datadog | string | 8080 | no | -| datadog\_key\_secret\_name | Name of the secret containing the DataDog API key. This needs to be created manually in AWS secrets manager. This is only applicable to ECS deployments. | string | null | no | -| datadog\_agent\_version | Version of the Datadog Agent running in the ECS cluster. This is only applicable to ECS deployments. 
| string | 7.50.3-jmx | no | -| datadog\_agent\_enabled | Whether to include the datadog-agent container. This is only applicable to ECS deployments. | string | false | no | -| enable\_tcp\_keepalive | tcp_keepalive settings on HMS pods. To use this you need to enable the ability to cahnge sysctl settings on your kubernetes cluster. For EKS you need to allow this on your cluster (https://kubernetes.io/docs/tasks/administer-cluster/sysctl-cluster/ check EKS version for details). If your EKS version is below 1.24 you need to create a PodSecurityPolicy allowing the following sysctls "net.ipv4.tcp_keepalive_time", "net.ipv4.tcp_keepalive_intvl","net.ipv4.tcp_keepalive_probes" and a ClusterRole + Rolebinding for the service account running the HMS pods or all services accounts in the namespace where Apiary is running so that kubernetes can apply the tcp)keepalive configuration. For EKS 1.25 and above check this https://kubernetes.io/blog/2022/08/23/kubernetes-v1-25-release/#pod-security-changes. Also see tcp_keepalive_* variables. | bool | false | no | -| tcp\_keepalive\_time | Sets net.ipv4.tcp_keepalive_time (seconds). | number | `200` | no | -| tcp\_keepalive\_intvl | Sets net.ipv4.tcp_keepalive_intvl (seconds) | number | `30` | no | -| tcp\_keepalive\_probes | Sets net.ipv4.tcp_keepalive_probes (seconds) | number | `2` | no | -| ecs\_platform\_version | ECS Service Platform Version | `string` | `"LATEST"` -| ecs\_requires\_compatibilities | ECS task definition requires compatibilities. | `list(string)` | `["EC2", "FARGATE"]` | no | -| hms\_ecs\_metrics\_readonly\_namespace | ECS readwrite metrics namespace | `string` | `hmsreadonlylegacy` | no | -| hms\_ecs\_metrics\_readwrite\_namespace | ECS readonly metrics namespace | `string` | `hmsreadwritelegacy` | no | -| hms\_k8s\_metrics\_readonly\_namespace | K8s readwrite metrics namespace | `string` | `hms_readonly` | no | -| s3\_versioning\_expiration\_days | Number of days (TTL) before objects are expired. 
Bucket need to have versioning enabled. | `number` | `7` | no | +| vpc\_id | VPC ID. | `string` | `n/a` | yes | +| enable\_dashboard | make EKS & ECS dashboard optional | `bool` | `true` | no | +| rds\_family | RDS Family | `string` | `aurora5.6` | no | +| datadog\_metrics\_enabled | Enable Datadog metrics for HMS | `bool` | `false` | no | +| datadog\_metrics\_hms\_readwrite\_readonly | Prometheus Metrics sent to datadog | `list(string)` | ["metrics_classloading_loaded_value","metrics_threads_count_value","metrics_memory_heap_max_value","metrics_init_total_count_tables_value","metrics_init_total_count_dbs_value","metrics_memory_heap_used_value","metrics_init_total_count_partitions_value"] | no | +| datadog_metrics_port | Port in which metrics will be send for Datadog | `string` | `8080` | no | +| datadog\_key\_secret\_name | Name of the secret containing the DataDog API key. This needs to be created manually in AWS secrets manager. This is only applicable to ECS deployments. | `string` | `null` | no | +| datadog\_agent\_version | Version of the Datadog Agent running in the ECS cluster. This is only applicable to ECS deployments. | `string` | `7.50.3-jmx` | no | +| datadog\_agent\_enabled | Whether to include the datadog-agent container. This is only applicable to ECS deployments. | `string` | `false` | no | +| enable\_tcp\_keepalive | tcp_keepalive settings on HMS pods. To use this you need to enable the ability to cahnge sysctl settings on your kubernetes cluster. For EKS you need to allow this on your cluster (https://kubernetes.io/docs/tasks/administer-cluster/sysctl-cluster/ check EKS version for details). 
If your EKS version is below 1.24 you need to create a PodSecurityPolicy allowing the following sysctls "net.ipv4.tcp_keepalive_time", "net.ipv4.tcp_keepalive_intvl","net.ipv4.tcp_keepalive_probes" and a ClusterRole + Rolebinding for the service account running the HMS pods or all services accounts in the namespace where Apiary is running so that kubernetes can apply the tcp_keepalive configuration. For EKS 1.25 and above check this https://kubernetes.io/blog/2022/08/23/kubernetes-v1-25-release/#pod-security-changes. Also see tcp_keepalive_* variables. | `bool` | `false` | no | +| tcp\_keepalive\_time | Sets net.ipv4.tcp_keepalive_time (seconds). | `number` | `200` | no | +| tcp\_keepalive\_intvl | Sets net.ipv4.tcp_keepalive_intvl (seconds) | `number` | `30` | no | +| tcp\_keepalive\_probes | Sets net.ipv4.tcp_keepalive_probes (seconds) | `number` | `2` | no | +| ecs\_platform\_version | ECS Service Platform Version | `string` | `"LATEST"` | no | +| ecs\_requires\_compatibilities | ECS task definition requires compatibilities. | `list(string)` | `["EC2", "FARGATE"]` | no | +| hms\_ecs\_metrics\_readonly\_namespace | ECS readonly metrics namespace | `string` | `hmsreadonlylegacy` | no | +| hms\_ecs\_metrics\_readwrite\_namespace | ECS readwrite metrics namespace | `string` | `hmsreadwritelegacy` | no | +| hms\_k8s\_metrics\_readonly\_namespace | K8s readonly metrics namespace | `string` | `hms_readonly` | no | +| s3\_versioning\_expiration\_days | Number of days (TTL) before objects are expired. Bucket need to have versioning enabled. | `number` | `7` | no | +| enable_splunk_logging | Enable sending logs to Splunk. When enabled, splunk_hec_token, splunk_hec_host and splunk_hec_index must also be set. | `bool` | `false` | no | +| splunk_hec_token | The token used for authentication with the Splunk HTTP Event Collector (HEC). This is required for sending logs to Splunk. Compatible with both EC2 and FARGATE ECS task definitions.
| `string` | `""` | no | +| splunk_hec_host | The hostname or URL of the Splunk HTTP Event Collector (HEC) endpoint to which logs will be sent. | `string` | `""` | no | +| splunk_hec_index | The index in Splunk where logs will be stored. This is used to organize and manage logs within Splunk. | `string` | `""` | no | ### apiary_assume_roles diff --git a/templates.tf b/templates.tf index c8d5aa7..2c8be71 100644 --- a/templates.tf +++ b/templates.tf @@ -57,23 +57,29 @@ locals{ s3_enable_logs = local.enable_apiary_s3_log_hive ? "1" : "" # Template vars for init container - init_container_enabled = var.external_database_host == "" ? true : false - mysql_permissions = "ALL" - mysql_master_cred_arn = var.external_database_host == "" ? aws_secretsmanager_secret.apiary_mysql_master_credentials[0].arn : null - mysql_user_cred_arn = data.aws_secretsmanager_secret.db_rw_user.arn + init_container_enabled = var.external_database_host == "" ? true : false + mysql_permissions = "ALL" + mysql_master_cred_arn = var.external_database_host == "" ? aws_secretsmanager_secret.apiary_mysql_master_credentials[0].arn : null + mysql_user_cred_arn = data.aws_secretsmanager_secret.db_rw_user.arn # Datadog variables - datadog_secret_key = length(var.datadog_key_secret_name) > 0 ?
chomp(data.external.datadog_key[0].result["api_key"]) : "" + wd_instance_type = var.hms_instance_type + metrics_port = var.datadog_metrics_port + datadog_agent_version = var.datadog_agent_version + datadog_agent_enabled = var.datadog_agent_enabled + datadog_tags = local.datadog_tags + tcp_keepalive_time = var.tcp_keepalive_time + tcp_keepalive_intvl = var.tcp_keepalive_intvl + tcp_keepalive_probes = var.tcp_keepalive_probes hms_metrics = local.hms_metrics hms_metrics_type_overrides = local.hms_metrics_type_overrides + + // Splunk configuration + enable_splunk_logging = var.enable_splunk_logging + splunk_hec_host = var.splunk_hec_host + splunk_hec_token = var.splunk_hec_token + splunk_hec_index = var.splunk_hec_index }) hms_readonly_template = templatefile("${path.module}/templates/apiary-hms-readonly.json", { @@ -131,5 +137,11 @@ locals{ hms_metrics_namespace = "${var.hms_ecs_metrics_readonly_namespace}" hms_metrics = local.hms_metrics hms_metrics_type_overrides = local.hms_metrics_type_overrides + + // Splunk configuration + enable_splunk_logging = var.enable_splunk_logging + splunk_hec_host = var.splunk_hec_host + splunk_hec_token = var.splunk_hec_token + splunk_hec_index = var.splunk_hec_index }) } diff --git a/templates/apiary-hms-readonly.json b/templates/apiary-hms-readonly.json index ea7a36b..13f4fda 100644 --- a/templates/apiary-hms-readonly.json +++ b/templates/apiary-hms-readonly.json @@ -66,12 +66,21 @@ "value": "${tcp_keepalive_probes}" } ], - "logConfiguration": { - "logDriver": "awslogs", - "options": { + "logConfiguration": { + "logDriver": "${enable_splunk_logging ? "splunk" : "awslogs"}", + "options": ${enable_splunk_logging ? 
+ jsonencode({ + "splunk-token": "${splunk_hec_token}", + "splunk-url": "${splunk_hec_host}", + "splunk-source": "apiary-hms-readonly-logs", + "splunk-sourcetype": "hms-logs", + "splunk-index": "${splunk_hec_index}" + }) : + jsonencode({ "awslogs-group": "${loggroup}", "awslogs-region": "${region}", "awslogs-stream-prefix": "/" + }) } }, "dockerLabels": { @@ -93,6 +102,16 @@ } ], %{ endif } + "healthCheck": { + "command": [ + "CMD-SHELL", + "curl -f http://localhost:8080/actuator/health || exit 1" + ], + "interval": 30, + "timeout": 5, + "retries": 3, + "startPeriod": 10 + }, "environment":[ { "name": "MYSQL_DB_HOST", diff --git a/templates/apiary-hms-readwrite.json b/templates/apiary-hms-readwrite.json index 64220d2..79dd26c 100644 --- a/templates/apiary-hms-readwrite.json +++ b/templates/apiary-hms-readwrite.json @@ -67,11 +67,20 @@ } ], "logConfiguration": { - "logDriver": "awslogs", - "options": { + "logDriver": "${enable_splunk_logging ? "splunk" : "awslogs"}", + "options": ${enable_splunk_logging ? + jsonencode({ + "splunk-token": "${splunk_hec_token}", + "splunk-url": "${splunk_hec_host}", + "splunk-source": "apiary-hms-readwrite-logs", + "splunk-sourcetype": "hms-logs", + "splunk-index": "${splunk_hec_index}" + }) : + jsonencode({ "awslogs-group": "${loggroup}", "awslogs-region": "${region}", "awslogs-stream-prefix": "/" + }) } }, "dockerLabels": { @@ -93,6 +102,16 @@ } ], %{ endif } + "healthCheck": { + "command": [ + "CMD-SHELL", + "curl -f http://localhost:8080/actuator/health || exit 1" + ], + "interval": 30, + "timeout": 5, + "retries": 3, + "startPeriod": 10 + }, "environment":[ { "name": "MYSQL_DB_HOST", diff --git a/variables.tf b/variables.tf index e9d82a2..4ff64db 100644 --- a/variables.tf +++ b/variables.tf @@ -1124,3 +1124,28 @@ EOF })) default = [] } + +variable "enable_splunk_logging" { + description = "Enable sending logs to Splunk. When enabled, splunk_hec_token, splunk_hec_host and splunk_hec_index must also be set."
+ type = bool + default = false +} + +variable "splunk_hec_token" { + description = "The token used for authentication with the Splunk HTTP Event Collector (HEC). This is required for sending logs to Splunk. Compatible with both EC2 and FARGATE ECS task definitions." + type = string + default = "" +} + +variable "splunk_hec_host" { + description = "The hostname or URL of the Splunk HTTP Event Collector (HEC) endpoint to which logs will be sent." + type = string + default = "" +} + +variable "splunk_hec_index" { + description = "The index in Splunk where logs will be stored. This is used to organize and manage logs within Splunk." + type = string + default = "" +} +