diff --git a/.github/workflows/k8s-apply.yaml b/.github/workflows/k8s-apply.yaml
index cb3367f..153674b 100644
--- a/.github/workflows/k8s-apply.yaml
+++ b/.github/workflows/k8s-apply.yaml
@@ -130,3 +130,10 @@ jobs:
       ecs_cluster_name: ${{ secrets.ECS_CLUSTER_NAME }}
       pat_token: ${{ secrets.BOT_TOKEN }}
       environment: ${{ inputs.environment }}
+
+  tf_apply:
+    secrets: inherit
+    uses: ./.github/workflows/tf-apply.yaml
+    with:
+      environment: ${{ inputs.environment }}
+      timeout_seconds: 300
diff --git a/.github/workflows/tf-apply.yaml b/.github/workflows/tf-apply.yaml
new file mode 100644
index 0000000..feeb2ce
--- /dev/null
+++ b/.github/workflows/tf-apply.yaml
@@ -0,0 +1,115 @@
+name: TF Apply
+
+on:
+  workflow_call:
+    inputs:
+      environment:
+        description: 'Environment to run apply against'
+        required: true
+        type: string
+      # NOTE(review): timeout_seconds is not referenced anywhere in this
+      # workflow; the apply job below is bounded by a fixed timeout-minutes.
+      timeout_seconds:
+        description: 'Terraform apply wait timeout in seconds'
+        required: true
+        type: number
+    secrets:
+      AWS_REGION:
+        required: true
+      TERRAFORM_IAM_ROLE_ARN:
+        required: true
+
+defaults:
+  run:
+    shell: bash
+
+jobs:
+  workflow_setup:
+    name: Setup steps
+    runs-on: ubuntu-latest
+    environment: ${{ inputs.environment }}
+    outputs:
+      microservices: ${{ steps.set-outputs.outputs.microservices }}
+      cronjobs: ${{ steps.set-outputs.outputs.cronjobs }}
+    steps:
+      - name: Checkout
+        id: checkout
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+
+      - name: Normalize environment
+        id: norm_env
+        # The input reaches the script through env, not inline interpolation.
+        env:
+          GH_ENV: ${{ inputs.environment }}
+        run: |
+          NORM_ENV="$(echo "$GH_ENV" | sed -e 's/-tf//')"
+          echo "NORM_ENV=$NORM_ENV" >> "$GITHUB_ENV"
+
+      - id: set-outputs
+        run: |
+          echo "microservices=$(find microservices -type f -path "*/$NORM_ENV/values.yaml" -exec dirname {} \; | awk -F'/' '{print $2}' | jq -R -s -c 'split("\n")[:-1]')" >> "$GITHUB_OUTPUT"
+          echo "cronjobs=$(find jobs -type f -path "*/$NORM_ENV/values.yaml" -exec dirname {} \; | awk -F'/' '{print $2}' | jq -R -s -c 'split("\n")[:-1]')" >> "$GITHUB_OUTPUT"
+
+  terraform_apply_monitoring:
+    name: Terraform Apply Monitoring
+    needs: workflow_setup
+    runs-on: ubuntu-latest
+    timeout-minutes: 3
+    environment: ${{ inputs.environment }}
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29
+
+      - name: Create microservices JSON
+        working-directory: terraform/k8s-monitoring/
+        env:
+          MICROSERVICES_JSON: ${{ needs.workflow_setup.outputs.microservices }}
+        run: |
+          mkdir -p assets
+          printf '%s\n' "$MICROSERVICES_JSON" | jq -c 'sort' > assets/microservices-list.json
+
+      - name: Create cronjobs JSON
+        working-directory: terraform/k8s-monitoring/
+        env:
+          CRONJOBS_JSON: ${{ needs.workflow_setup.outputs.cronjobs }}
+        run: |
+          mkdir -p assets
+          printf '%s\n' "$CRONJOBS_JSON" | jq -c 'sort' > assets/cronjobs-list.json
+
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@67fbcbb121271f7775d2e7715933280b06314838
+        with:
+          role-to-assume: ${{ secrets.TERRAFORM_IAM_ROLE_ARN }}
+          aws-region: ${{ secrets.AWS_REGION }}
+
+      - name: Read Terraform version
+        id: read-version
+        working-directory: terraform/
+        run: |
+          echo "TERRAFORM_VERSION=$(cat ./.terraform-version)" >> "$GITHUB_ENV"
+
+      - name: Setup Terraform
+        uses: hashicorp/setup-terraform@633666f66e0061ca3b725c73b2ec20cd13a8fdd1
+        with:
+          terraform_version: ${{ env.TERRAFORM_VERSION }}
+
+      - name: Normalize environment
+        id: norm_env
+        env:
+          GH_ENV: ${{ inputs.environment }}
+        run: |
+          NORM_ENV="$(echo "$GH_ENV" | sed -e 's/-tf//')"
+          echo "NORM_ENV=$NORM_ENV" >> "$GITHUB_ENV"
+
+      - name: Terraform Init
+        id: terraform_init
+        working-directory: terraform/k8s-monitoring
+        run: |
+          ./terraform.sh init "$NORM_ENV"
+
+      - name: Terraform Apply Monitoring
+        id: terraform_apply_monitoring
+        working-directory: terraform/k8s-monitoring
+        run: |
+          terraform apply -var-file="./env/$NORM_ENV/terraform.tfvars" -auto-approve
diff --git a/.gitignore b/.gitignore
index 5fb72c6..9d03ef6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,25 @@ charts/
 Chart.lock
 out*
 .DS_Store
-**/*.compiled.yaml
\ No newline at end of file
+**/*.compiled.yaml
+
+# Local .terraform directories
+**/.terraform/*
+
+# .tfstate files
+*.tfstate
+*.tfstate.*
+
+# Crash log files
+crash.log
+
+# Ignore override files as they are usually used to override resources locally and so
+# are not checked in
+override.tf
+override.tf.json
+*_override.tf
+*_override.tf.json
+
+# Ignore JSON microservices list
+terraform/k8s-monitoring/assets/microservices-list.json
+terraform/k8s-monitoring/assets/cronjobs-list.json
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..7fa94d6
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,11 @@
+repos:
+  - repo: https://github.com/antonbabenko/pre-commit-terraform
+    rev: v1.96.2
+    hooks:
+      - id: terraform_fmt
+      # TODO: issue when validating modules
+      # - id: terraform_validate
+      #   args:
+      #     - --init-args=-lockfile=readonly
+      #     - --args=-json
+      #     - --args=-no-color
diff --git a/terraform/.terraform-version b/terraform/.terraform-version
new file mode 100644
index 0000000..27f9cd3
--- /dev/null
+++ b/terraform/.terraform-version
@@ -0,0 +1 @@
+1.8.0
diff --git a/terraform/k8s-monitoring/.terraform.lock.hcl b/terraform/k8s-monitoring/.terraform.lock.hcl
new file mode 100644
index 0000000..c8018c9
--- /dev/null
+++ b/terraform/k8s-monitoring/.terraform.lock.hcl
@@ -0,0 +1,63 @@
+# This file is maintained automatically by "terraform init".
+# Manual edits may be lost in future updates.
+
+provider "registry.terraform.io/hashicorp/aws" {
+  version     = "5.46.0"
+  constraints = "~> 5.46.0"
+  hashes = [
+    "h1:d0Mf33mbbQujZ/JaYkqmH5gZGvP+iEIWf9yBSiOwimE=",
+    "zh:05ae6180a7f23071435f6e5e59c19af0b6c5da42ee600c6c1568c8660214d548",
+    "zh:0d878d1565d5e57ce6b34ec5f04b28662044a50c999ec5770c374aa1f1020de2",
+    "zh:25ef1467af2514d8011c44759307445f7057836ff87dfe4503c3e1c9776d5c1a",
+    "zh:26c006df6200f0063b827aab05bec94f9f3f77848e82ed72e48a51d1170d1961",
+    "zh:37cdf4292649a10f12858622826925e18ad4eca354c31f61d02c66895eb91274",
+    "zh:4315b0433c2fc512666c74e989e2d95240934ef370bea1c690d36cb02d30c4ce",
+    "zh:75df0b3f631b78aeff1832cc77d99b527c2a5e79d40f7aac40bdc4a66124dac2",
+    "zh:90693d936c9a556d2bf945de4920ff82052002eb73139bd7164fafd02920f0ef",
+    "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425",
+    "zh:c9177ad09804c60fd2ed25950570407b6bdcdf0fcc309e1673b584f06a827fae",
+    "zh:ca8e8db24a4d62d92afd8d3d383b81a08693acac191a2e0a110fb46deeff56a3",
+    "zh:d5fa3a36e13957d63bfe9bbd6df0426a2422214403aac9f20b60c36f8d9ebec6",
+    "zh:e4ede44a112296c9cc77b15e439e41ee15c0e8b3a0dec94ae34df5ebba840e8b",
+    "zh:f2d4de8d8cde69caffede1544ebea74e69fcc4552e1b79ae053519a05c060706",
+    "zh:fc19e9266b1841d4a3aeefa8a5b5ad6988baed6540f85a373b6c2d0dc1ca5830",
+  ]
+}
+
+provider "registry.terraform.io/hashicorp/external" {
+  version = "2.3.4"
+  hashes = [
+    "h1:cCabxnWQ5fX1lS7ZqgUzsvWmKZw9FA7NRxAZ94vcTcc=",
+    "zh:037fd82cd86227359bc010672cd174235e2d337601d4686f526d0f53c87447cb",
+    "zh:0ea1db63d6173d01f2fa8eb8989f0809a55135a0d8d424b08ba5dabad73095fa",
+    "zh:17a4d0a306566f2e45778fbac48744b6fd9c958aaa359e79f144c6358cb93af0",
+    "zh:298e5408ab17fd2e90d2cd6d406c6d02344fe610de5b7dae943a58b958e76691",
+    "zh:38ecfd29ee0785fd93164812dcbe0664ebbe5417473f3b2658087ca5a0286ecb",
+    "zh:59f6a6f31acf66f4ea3667a555a70eba5d406c6e6d93c2c641b81d63261eeace",
+    "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3",
+    "zh:ad0279dfd09d713db0c18469f585e58d04748ca72d9ada83883492e0dd13bd58",
+    "zh:c69f66fd21f5e2c8ecf7ca68d9091c40f19ad913aef21e3ce23836e91b8cbb5f",
+    "zh:d4a56f8c48aa86fc8e0c233d56850f5783f322d6336f3bf1916e293246b6b5d4",
+    "zh:f2b394ebd4af33f343835517e80fc876f79361f4688220833bc3c77655dd2202",
+    "zh:f31982f29f12834e5d21e010856eddd19d59cd8f449adf470655bfd19354377e",
+  ]
+}
+
+provider "registry.terraform.io/hashicorp/local" {
+  version = "2.5.2"
+  hashes = [
+    "h1:IyFbOIO6mhikFNL/2h1iZJ6kyN3U00jgkpCLUCThAfE=",
+    "zh:136299545178ce281c56f36965bf91c35407c11897f7082b3b983d86cb79b511",
+    "zh:3b4486858aa9cb8163378722b642c57c529b6c64bfbfc9461d940a84cd66ebea",
+    "zh:4855ee628ead847741aa4f4fc9bed50cfdbf197f2912775dd9fe7bc43fa077c0",
+    "zh:4b8cd2583d1edcac4011caafe8afb7a95e8110a607a1d5fb87d921178074a69b",
+    "zh:52084ddaff8c8cd3f9e7bcb7ce4dc1eab00602912c96da43c29b4762dc376038",
+    "zh:71562d330d3f92d79b2952ffdda0dad167e952e46200c767dd30c6af8d7c0ed3",
+    "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3",
+    "zh:805f81ade06ff68fa8b908d31892eaed5c180ae031c77ad35f82cb7a74b97cf4",
+    "zh:8b6b3ebeaaa8e38dd04e56996abe80db9be6f4c1df75ac3cccc77642899bd464",
+    "zh:ad07750576b99248037b897de71113cc19b1a8d0bc235eb99173cc83d0de3b1b",
+    "zh:b9f1c3bfadb74068f5c205292badb0661e17ac05eb23bfe8bd809691e4583d0e",
+    "zh:cc4cbcd67414fefb111c1bf7ab0bc4beb8c0b553d01719ad17de9a047adff4d1",
+  ]
+}
diff --git a/terraform/k8s-monitoring/00-main.tf b/terraform/k8s-monitoring/00-main.tf
new file mode 100644
index 0000000..69ce5b4
--- /dev/null
+++ b/terraform/k8s-monitoring/00-main.tf
@@ -0,0 +1,22 @@
+terraform {
+  required_version = "~> 1.8.0"
+
+  backend "s3" {}
+
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "~> 5.46.0"
+    }
+  }
+}
+
+provider "aws" {
+  region = var.aws_region
+
+  default_tags {
+    tags = var.tags
+  }
+}
+
+data "aws_caller_identity" "current" {}
diff --git a/terraform/k8s-monitoring/10-cloudwatch.tf b/terraform/k8s-monitoring/10-cloudwatch.tf
new file mode 100644
index 0000000..f8bd07e
--- /dev/null
+++ b/terraform/k8s-monitoring/10-cloudwatch.tf
@@ -0,0 +1,10 @@
+# Reads, through the AWS CLI, the metric name and namespace of the first metric
+# transformation of the first metric filter on the given log group.
+data "external" "cloudwatch_log_metric_filters" {
+  program = [
+    "aws", "logs", "describe-metric-filters",
+    "--log-group-name", var.cloudwatch_log_group_name,
+    "--output", "json",
+    "--query", "metricFilters[0].metricTransformations[0].{metricName: metricName, metricNamespace: metricNamespace}",
+  ]
+}
diff --git a/terraform/k8s-monitoring/10-sns.tf b/terraform/k8s-monitoring/10-sns.tf
new file mode 100644
index 0000000..aa959c2
--- /dev/null
+++ b/terraform/k8s-monitoring/10-sns.tf
@@ -0,0 +1,3 @@
+data "aws_sns_topic" "platform_alarms" {
+  name = var.sns_topic_name
+}
diff --git a/terraform/k8s-monitoring/20-k8s-monitoring-cronjobs.tf b/terraform/k8s-monitoring/20-k8s-monitoring-cronjobs.tf
new file mode 100644
index 0000000..b6ff7c4
--- /dev/null
+++ b/terraform/k8s-monitoring/20-k8s-monitoring-cronjobs.tf
@@ -0,0 +1,35 @@
+data "local_file" "cronjobs_list" {
+  filename = "${path.module}/assets/cronjobs-list.json"
+}
+
+locals {
+  cronjobs_names = jsondecode(data.local_file.cronjobs_list.content)
+}
+
+resource "aws_cloudwatch_metric_alarm" "cronjob_errors" {
+  for_each = toset(local.cronjobs_names)
+
+  alarm_name        = format("k8s-cronjob-%s-errors-%s", each.key, var.env)
+  alarm_description = format("Cronjob errors alarm for %s", each.key)
+
+  alarm_actions = [data.aws_sns_topic.platform_alarms.arn]
+
+  metric_name = try(data.external.cloudwatch_log_metric_filters.result.metricName, null)
+  namespace   = try(data.external.cloudwatch_log_metric_filters.result.metricNamespace, null)
+
+  dimensions = {
+    PodApp       = each.key
+    PodNamespace = var.env
+  }
+
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  statistic           = "Sum"
+  treat_missing_data  = "notBreaching"
+
+  threshold           = 1
+  period              = 60 # 1 minute
+  evaluation_periods  = 5
+  datapoints_to_alarm = 1
+
+  tags = var.tags
+}
diff --git a/terraform/k8s-monitoring/20-k8s-monitoring-deployments.tf b/terraform/k8s-monitoring/20-k8s-monitoring-deployments.tf
new file mode 100644
index 0000000..2bdda3b
--- /dev/null
+++ b/terraform/k8s-monitoring/20-k8s-monitoring-deployments.tf
@@ -0,0 +1,35 @@
+data "local_file" "microservices_list" {
+  filename = "${path.module}/assets/microservices-list.json"
+}
+
+locals {
+  microservices_names = jsondecode(data.local_file.microservices_list.content)
+}
+
+module "k8s_deployment_monitoring" {
+  for_each = toset(local.microservices_names)
+
+  source = "git::https://github.com/pagopa/interop-infra-commons//terraform/modules/k8s-deployment-monitoring?ref=v1.4.5"
+
+  env                 = var.env
+  eks_cluster_name    = var.eks_cluster_name
+  k8s_namespace       = var.env
+  k8s_deployment_name = each.key
+  sns_topics_arns     = [data.aws_sns_topic.platform_alarms.arn]
+
+  create_pod_availability_alarm = false
+  create_pod_readiness_alarm    = true
+  create_performance_alarm      = true
+  create_app_logs_errors_alarm  = true
+
+  avg_cpu_alarm_threshold           = 70
+  avg_memory_alarm_threshold        = 70
+  performance_alarms_period_seconds = 300 # 5 minutes
+
+  create_dashboard = true
+
+  cloudwatch_app_logs_errors_metric_name      = try(data.external.cloudwatch_log_metric_filters.result.metricName, null)
+  cloudwatch_app_logs_errors_metric_namespace = try(data.external.cloudwatch_log_metric_filters.result.metricNamespace, null)
+
+  tags = var.tags
+}
diff --git a/terraform/k8s-monitoring/98-variables.tf b/terraform/k8s-monitoring/98-variables.tf
new file mode 100644
index 0000000..04469ac
--- /dev/null
+++ b/terraform/k8s-monitoring/98-variables.tf
@@ -0,0 +1,32 @@
+variable "aws_region" {
+  type        = string
+  description = "AWS region"
+}
+
+variable "env" {
+  type        = string
+  description = "Environment name"
+}
+
+variable "tags" {
+  type        = map(string)
+  description = "Tags passed to the AWS provider default_tags and to the monitoring resources"
+  default = {
+    CreatedBy = "Terraform"
+  }
+}
+
+variable "eks_cluster_name" {
+  type        = string
+  description = "Name of the tracing EKS cluster"
+}
+
+variable "sns_topic_name" {
+  description = "Name of the SNS topic for alarms notifications"
+  type        = string
+}
+
+variable "cloudwatch_log_group_name" {
+  description = "Name of the Cloudwatch log group to get metric filters"
+  type        = string
+}
diff --git a/terraform/k8s-monitoring/99-outputs.tf b/terraform/k8s-monitoring/99-outputs.tf
new file mode 100644
index 0000000..e69de29
diff --git a/terraform/k8s-monitoring/env/dev/backend.tfvars b/terraform/k8s-monitoring/env/dev/backend.tfvars
new file mode 100644
index 0000000..afbbeeb
--- /dev/null
+++ b/terraform/k8s-monitoring/env/dev/backend.tfvars
@@ -0,0 +1,4 @@
+bucket         = "terraform-backend-590183909663"
+key            = "dev/interop-tracing-deployment/monitoring.tfstate"
+region         = "eu-south-1"
+dynamodb_table = "terraform-lock"
diff --git a/terraform/k8s-monitoring/env/dev/terraform.tfvars b/terraform/k8s-monitoring/env/dev/terraform.tfvars
new file mode 100644
index 0000000..0c121fa
--- /dev/null
+++ b/terraform/k8s-monitoring/env/dev/terraform.tfvars
@@ -0,0 +1,15 @@
+aws_region = "eu-south-1"
+env        = "dev"
+
+tags = {
+  CreatedBy   = "Terraform"
+  Environment = "dev"
+  Owner       = "PagoPA"
+  Source      = "https://github.com/pagopa/interop-tracing-deployment"
+}
+
+eks_cluster_name = "tracing-eks-cluster-dev"
+
+sns_topic_name = "tracing-platform-alarms-dev"
+
+cloudwatch_log_group_name = "/aws/eks/tracing-eks-cluster-dev/application"
diff --git a/terraform/k8s-monitoring/terraform.sh b/terraform/k8s-monitoring/terraform.sh
new file mode 100755
index 0000000..2374402
--- /dev/null
+++ b/terraform/k8s-monitoring/terraform.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+. ../terraform.sh
diff --git a/terraform/terraform.sh b/terraform/terraform.sh
new file mode 100755
index 0000000..16ce63b
--- /dev/null
+++ b/terraform/terraform.sh
@@ -0,0 +1,122 @@
+#!/bin/bash
+#
+# Thin wrapper around the terraform CLI that points it at the per-environment
+# backend config (./env/<env>/backend.tfvars) and variables file
+# (./env/<env>/terraform.tfvars).
+#
+# Usage: terraform.sh <action> <env> [extra terraform args...]
+#        terraform.sh <plan|apply|destroy> <env> -target-files <file.tf>...
+
+set -e
+
+action=${1:-}
+env=${2:-}
+
+# Validate before shifting: with fewer than two arguments `shift 2` fails and
+# `set -e` would abort the script before a usage message is printed.
+if [ -z "$action" ]; then
+  echo "Missed action: init, apply, plan" >&2
+  exit 1
+fi
+
+if [ -z "$env" ]; then
+  echo "env should be: dev, uat or prod." >&2
+  exit 1
+fi
+
+shift 2
+# Keep the remaining arguments as an array so each stays a separate word.
+other=("$@")
+
+# Runs `terraform plan` into a timestamped plan file, renders it with
+# tf-summarize (forwarding the extra CLI arguments) and removes the plan file.
+function tf_summarize() {
+  local plan_file
+  local status=0
+
+  # Check first so a missing binary does not leave a plan file behind.
+  if ! command -v tf-summarize > /dev/null; then
+    echo "tf-summarize binary not found" >&2
+    exit 1
+  fi
+
+  plan_file="tfplan-$(date +'%Y%m%d-%H%M%S')"
+
+  echo "Running terraform plan and tf-summarize..."
+  terraform plan -out="${plan_file}" -var-file="./env/$env/terraform.tfvars" > /dev/null
+
+  # `|| status=$?` keeps set -e from exiting, so the plan file is always removed.
+  tf-summarize "${other[@]}" "${plan_file}" || status=$?
+
+  rm -f "$plan_file"
+  return "$status"
+}
+
+# Runs `terraform <action>` with one -target per resource, module and data
+# block declared at column 0 of the given .tf files.
+function target_action() {
+  local target_files=("$@")
+  local tf_targets=()
+  local file line kind first second
+
+  if [ "${#target_files[@]}" -eq 0 ]; then
+    echo "Missing target files argument" >&2
+    exit 1
+  fi
+
+  for file in "${target_files[@]}"; do
+    if [ ! -f "$file" ]; then
+      echo "File $file not found." >&2
+      exit 1
+    fi
+  done
+
+  # Matching lines look like: <kind> "<first label>" ["<second label>"] {
+  while read -r line; do
+    kind=$(echo "$line" | cut -d '"' -f 1 | tr -d ' ')
+    first=$(echo "$line" | cut -d '"' -f 2)
+    second=$(echo "$line" | cut -d '"' -f 4)
+    case "$kind" in
+      module) tf_targets+=("-target=module.$first") ;;
+      data) tf_targets+=("-target=data.$first.$second") ;;
+      *) tf_targets+=("-target=$first.$second") ;;
+    esac
+  done < <(grep -hE '^(resource|module|data)[[:space:]]' "${target_files[@]}")
+
+  # An empty target list would make terraform act on the whole configuration.
+  if [ "${#tf_targets[@]}" -eq 0 ]; then
+    echo "No resource, module or data blocks found in target files" >&2
+    exit 1
+  fi
+
+  printf '%s\n' "${tf_targets[@]}"
+  terraform "$action" -var-file="./env/$env/terraform.tfvars" "${tf_targets[@]}"
+}
+
+case "$action" in
+  init)
+    terraform init -backend-config="./env/$env/backend.tfvars" "${other[@]}"
+    ;;
+  output | state | taint)
+    # init terraform backend
+    terraform init -reconfigure -backend-config="./env/$env/backend.tfvars"
+    terraform "$action" "${other[@]}"
+    ;;
+  summ)
+    terraform init -reconfigure -backend-config="./env/$env/backend.tfvars"
+    tf_summarize
+    ;;
+  plan | apply | destroy | refresh | import)
+    # init terraform backend
+    terraform init -reconfigure -backend-config="./env/$env/backend.tfvars"
+    if [[ "$action" =~ ^(plan|apply|destroy)$ && "${other[0]:-}" == "-target-files" ]]; then
+      target_action "${other[@]:1}"
+    else
+      terraform "$action" -var-file="./env/$env/terraform.tfvars" "${other[@]}"
+    fi
+    ;;
+  *)
+    echo "Action not allowed." >&2
+    exit 1
+    ;;
+esac