Skip to content

Commit

Permalink
feat: automating K8s workload monitoring (#16)
Browse files Browse the repository at this point in the history
  • Loading branch information
micheledellipaoli-pagopa authored Jan 3, 2025
1 parent af3017b commit 0c2045b
Show file tree
Hide file tree
Showing 17 changed files with 470 additions and 1 deletion.
8 changes: 8 additions & 0 deletions .github/workflows/k8s-apply.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -130,3 +130,11 @@ jobs:
ecs_cluster_name: ${{ secrets.ECS_CLUSTER_NAME }}
pat_token: ${{ secrets.BOT_TOKEN }}
environment: ${{ inputs.environment }}

# Calls the reusable "TF Apply" workflow for the environment this workflow was
# started with, forwarding all of the caller's secrets.
tf_apply:
  uses: ./.github/workflows/tf-apply.yaml
  with:
    environment: ${{ inputs.environment }}
    timeout_seconds: 300
  secrets: inherit

105 changes: 105 additions & 0 deletions .github/workflows/tf-apply.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
name: TF Apply

# Reusable workflow: it is only triggered through `workflow_call`.
on:
  workflow_call:
    inputs:
      environment:
        type: string
        required: true
        description: 'Environment to run apply against'
      timeout_seconds:
        type: number
        required: true
        description: 'Terraform apply wait timeout in seconds'
    # Secrets the caller must make available (e.g. via `secrets: inherit`).
    secrets:
      AWS_REGION:
        required: true
      TERRAFORM_IAM_ROLE_ARN:
        required: true

# Every `run:` step executes with bash unless the step sets its own shell.
defaults:
  run:
    shell: bash

jobs:
# Discovers which microservices and cronjobs have a values.yaml for the target
# environment and exposes both lists as JSON-array job outputs.
workflow_setup:
  name: Setup steps
  runs-on: ubuntu-latest
  environment: ${{ inputs.environment }}
  outputs:
    microservices: ${{ steps.set-outputs.outputs.microservices }}
    cronjobs: ${{ steps.set-outputs.outputs.cronjobs }}
  steps:
    - name: Checkout
      id: checkout
      uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29

    - name: Normalize environment
      id: norm_env
      env:
        # Passed through the environment instead of being interpolated into
        # the script text, so the input value is never parsed as shell code.
        INPUT_ENVIRONMENT: ${{ inputs.environment }}
      run: |
        # Drop the first "-tf" occurrence from the GitHub environment name.
        NORM_ENV="$(echo "$INPUT_ENVIRONMENT" | sed -e 's/-tf//')"
        echo "NORM_ENV=$NORM_ENV" >> "$GITHUB_ENV"

    - name: List microservices and cronjobs
      id: set-outputs
      run: |
        # Prints a compact JSON array with the second path component of every
        # directory under "$1" that holds a "<NORM_ENV>/values.yaml" file.
        list_workloads() {
          find "$1" -type f -path "*/$NORM_ENV/values.yaml" -exec dirname {} \; \
            | awk -F'/' '{print $2}' \
            | jq -R -s -c 'split("\n")[:-1]'
        }

        echo "microservices=$(list_workloads microservices)" >> "$GITHUB_OUTPUT"
        echo "cronjobs=$(list_workloads jobs)" >> "$GITHUB_OUTPUT"
# Writes the workload lists produced by `workflow_setup` to the JSON files the
# Terraform stack reads, then initialises and applies terraform/k8s-monitoring.
terraform_apply_monitoring:
  name: Terraform Apply Monitoring
  needs: workflow_setup
  runs-on: ubuntu-latest
  # NOTE(review): the `timeout_seconds` input is not referenced by this job,
  # and this fixed 3-minute limit is shorter than the 300 seconds passed by
  # the caller in k8s-apply.yaml. Confirm which value is intended.
  timeout-minutes: 3
  environment: ${{ inputs.environment }}

  steps:
    - name: Checkout
      # Same pinned commit as the checkout step in `workflow_setup`.
      uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29

    - name: Create microservices JSON
      working-directory: terraform/k8s-monitoring/
      env:
        # Handed over via the environment so the JSON is never spliced into
        # the script text.
        MICROSERVICES: ${{ needs.workflow_setup.outputs.microservices }}
      run: |
        mkdir -p assets
        printf '%s\n' "$MICROSERVICES" | jq -c 'sort' > assets/microservices-list.json

    - name: Create cronjobs JSON
      working-directory: terraform/k8s-monitoring/
      env:
        CRONJOBS: ${{ needs.workflow_setup.outputs.cronjobs }}
      run: |
        # Idempotent, so this step does not rely on the previous one having
        # created the directory.
        mkdir -p assets
        printf '%s\n' "$CRONJOBS" | jq -c 'sort' > assets/cronjobs-list.json

    - name: Configure AWS Credentials
      uses: aws-actions/configure-aws-credentials@67fbcbb121271f7775d2e7715933280b06314838
      with:
        role-to-assume: ${{ secrets.TERRAFORM_IAM_ROLE_ARN }}
        aws-region: ${{ secrets.AWS_REGION }}

    - name: Read Terraform version
      id: read-version
      working-directory: terraform/
      run: |
        echo "TERRAFORM_VERSION=$(cat ./.terraform-version)" >> "$GITHUB_ENV"

    - name: Setup Terraform
      uses: hashicorp/setup-terraform@633666f66e0061ca3b725c73b2ec20cd13a8fdd1
      with:
        terraform_version: ${{ env.TERRAFORM_VERSION }}

    - name: Normalize environment
      id: norm_env
      env:
        INPUT_ENVIRONMENT: ${{ inputs.environment }}
      run: |
        # Drop the first "-tf" occurrence from the GitHub environment name.
        NORM_ENV="$(echo "$INPUT_ENVIRONMENT" | sed -e 's/-tf//')"
        echo "NORM_ENV=$NORM_ENV" >> "$GITHUB_ENV"

    - name: Terraform Init
      id: terraform_init
      working-directory: terraform/k8s-monitoring
      run: |
        ./terraform.sh init "$NORM_ENV"

    - name: Terraform Apply Monitoring
      id: terraform_apply_monitoring
      working-directory: terraform/k8s-monitoring
      run: |
        # -input=false makes a missing variable fail immediately instead of
        # waiting on an interactive prompt until the job times out.
        terraform apply -input=false -var-file="./env/$NORM_ENV/terraform.tfvars" -auto-approve
23 changes: 22 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,25 @@ charts/
Chart.lock
out*
.DS_Store
**/*.compiled.yaml
**/*.compiled.yaml

# Local .terraform directories
**/.terraform/*

# .tfstate files
*.tfstate
*.tfstate.*

# Crash log files
crash.log

# Ignore override files as they are usually used to override resources locally and so
# are not checked in
override.tf
override.tf.json
*_override.tf
*_override.tf.json

# Ignore generated JSON lists of microservices and cronjobs
terraform/k8s-monitoring/assets/microservices-list.json
terraform/k8s-monitoring/assets/cronjobs-list.json
11 changes: 11 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# pre-commit hooks for the Terraform code in this repository.
repos:
  - repo: https://github.com/antonbabenko/pre-commit-terraform
    rev: 'v1.96.2'
    hooks:
      - id: terraform_fmt
      # TODO: issue when validating modules
      # - id: terraform_validate
      #   args:
      #     - --init-args=-lockfile=readonly
      #     - --args=-json
      #     - --args=-no-color
1 change: 1 addition & 0 deletions terraform/.terraform-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1.8.0
63 changes: 63 additions & 0 deletions terraform/k8s-monitoring/.terraform.lock.hcl

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 22 additions & 0 deletions terraform/k8s-monitoring/00-main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Core settings of the k8s-monitoring stack: Terraform version, state backend
# and the providers the configuration depends on.
terraform {
  # Accepts any 1.8.x release.
  required_version = "~> 1.8.0"

  # Partial backend configuration: no settings are given here.
  # NOTE(review): presumably they are supplied at init time from
  # env/<env>/backend.tfvars by terraform.sh — confirm.
  backend "s3" {}

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.46.0"
    }

    # Needed by the `data "external"` lookup of the CloudWatch metric filter.
    # No version constraint is added here so that the selection recorded in
    # .terraform.lock.hcl stays authoritative.
    external = {
      source = "hashicorp/external"
    }

    # Needed by the `data "local_file"` sources that read the generated
    # microservices/cronjobs JSON lists.
    local = {
      source = "hashicorp/local"
    }
  }
}

# AWS provider for the region given by `var.aws_region`.
provider "aws" {
  region = var.aws_region

  # `var.tags` is set as the provider's default tags.
  default_tags {
    tags = var.tags
  }
}

# Identity (account, ARN, user id) of the credentials Terraform runs with.
# NOTE(review): not referenced by the other files visible in this change —
# confirm it is still needed.
data "aws_caller_identity" "current" {}
3 changes: 3 additions & 0 deletions terraform/k8s-monitoring/10-cloudwatch.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Uses the AWS CLI to read the metric name and namespace of the first metric
# transformation of the first metric filter defined on the log group
# `var.cloudwatch_log_group_name`.
#
# The result exposes two string keys: `metricName` and `metricNamespace`.
#
# NOTE(review): when the log group has no metric filter the JMESPath query
# yields `null`, which is not a JSON object of strings; the external data
# source is then expected to fail — confirm that this is the desired behaviour.
data "external" "cloudwatch_log_metric_filters" {
  program = [
    "aws", "logs", "describe-metric-filters",
    # Query the same region the AWS provider is configured with, rather than
    # whatever region the ambient CLI environment happens to select.
    "--region", var.aws_region,
    "--log-group-name", var.cloudwatch_log_group_name,
    "--output", "json",
    "--query", "metricFilters[0].metricTransformations[0].{metricName: metricName, metricNamespace: metricNamespace}",
  ]
}
3 changes: 3 additions & 0 deletions terraform/k8s-monitoring/10-sns.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Existing SNS topic, looked up by name, that receives the alarm notifications.
data "aws_sns_topic" "platform_alarms" {
  name = var.sns_topic_name
}
35 changes: 35 additions & 0 deletions terraform/k8s-monitoring/20-k8s-monitoring-cronjobs.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# JSON file with the cronjob names; it lives under assets/ next to this module.
data "local_file" "cronjobs_list" {
  filename = "${path.module}/assets/cronjobs-list.json"
}

locals {
  # Decoded content of the JSON file above.
  cronjobs_names = jsondecode(data.local_file.cronjobs_list.content)
}

# One CloudWatch alarm per cronjob name, built on the metric discovered through
# the log-group metric filter and notifying the platform alarms SNS topic.
resource "aws_cloudwatch_metric_alarm" "cronjob_errors" {
  for_each = toset(local.cronjobs_names)

  alarm_name        = "k8s-cronjob-${each.key}-errors-${var.env}"
  alarm_description = "Cronjob errors alarm for ${each.key}"
  alarm_actions     = [data.aws_sns_topic.platform_alarms.arn]

  # Metric identity; either value falls back to null when the lookup result
  # does not carry the corresponding key.
  namespace   = try(data.external.cloudwatch_log_metric_filters.result.metricNamespace, null)
  metric_name = try(data.external.cloudwatch_log_metric_filters.result.metricName, null)
  dimensions = {
    PodApp       = each.key
    PodNamespace = var.env
  }

  # Alarm when the per-minute Sum is >= 1 in at least 1 of the last 5 periods;
  # periods without data count as not breaching.
  statistic           = "Sum"
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 1
  period              = 60 # 1 minute
  evaluation_periods  = 5
  datapoints_to_alarm = 1
  treat_missing_data  = "notBreaching"

  tags = var.tags
}
35 changes: 35 additions & 0 deletions terraform/k8s-monitoring/20-k8s-monitoring-deployments.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# JSON file with the microservice names; it lives under assets/ next to this
# module.
data "local_file" "microservices_list" {
  filename = "${path.module}/assets/microservices-list.json"
}

locals {
  # Decoded content of the JSON file above.
  microservices_names = jsondecode(data.local_file.microservices_list.content)
}

# One instance of the shared k8s-deployment-monitoring module per microservice
# name, with the environment name used as the Kubernetes namespace.
module "k8s_deployment_monitoring" {
  source = "git::https://github.com/pagopa/interop-infra-commons//terraform/modules/k8s-deployment-monitoring?ref=v1.4.5"

  for_each = toset(local.microservices_names)

  # Target workload.
  env                 = var.env
  eks_cluster_name    = var.eks_cluster_name
  k8s_namespace       = var.env
  k8s_deployment_name = each.key

  # Notification targets.
  sns_topics_arns = [data.aws_sns_topic.platform_alarms.arn]

  # Feature switches.
  create_pod_availability_alarm = false
  create_pod_readiness_alarm    = true
  create_performance_alarm      = true
  create_app_logs_errors_alarm  = true
  create_dashboard              = true

  # Performance alarm settings.
  avg_cpu_alarm_threshold           = 70
  avg_memory_alarm_threshold        = 70
  performance_alarms_period_seconds = 300 # 5 minutes

  # Metric discovered through the log-group metric filter; either value falls
  # back to null when the lookup result does not carry the corresponding key.
  cloudwatch_app_logs_errors_metric_name      = try(data.external.cloudwatch_log_metric_filters.result.metricName, null)
  cloudwatch_app_logs_errors_metric_namespace = try(data.external.cloudwatch_log_metric_filters.result.metricNamespace, null)

  tags = var.tags
}
31 changes: 31 additions & 0 deletions terraform/k8s-monitoring/98-variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Input variables of the k8s-monitoring stack. Every variable lists
# `description` first, then `type`, then `default` (when present).

variable "aws_region" {
  description = "AWS region"
  type        = string
}

variable "env" {
  description = "Environment name"
  type        = string
}

variable "tags" {
  description = "Tags applied to the resources managed by this stack"
  # Tag keys and values are strings, so the map is typed accordingly.
  type = map(string)
  default = {
    CreatedBy = "Terraform"
  }
}

variable "eks_cluster_name" {
  description = "Name of the tracing EKS cluster"
  type        = string
}

variable "sns_topic_name" {
  description = "Name of the SNS topic for alarms notifications"
  type        = string
}

variable "cloudwatch_log_group_name" {
  description = "Name of the Cloudwatch log group to get metric filters"
  type        = string
}
Empty file.
4 changes: 4 additions & 0 deletions terraform/k8s-monitoring/env/dev/backend.tfvars
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# S3 backend settings for the dev environment.
# NOTE(review): presumably passed to `terraform init` by terraform.sh — confirm.
bucket         = "terraform-backend-590183909663"
key            = "dev/interop-tracing-deployment/monitoring.tfstate"
region         = "eu-south-1"
dynamodb_table = "terraform-lock"
15 changes: 15 additions & 0 deletions terraform/k8s-monitoring/env/dev/terraform.tfvars
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Variable values for the dev environment.
aws_region = "eu-south-1"
env        = "dev"

tags = {
  CreatedBy   = "Terraform"
  Environment = "dev"
  Owner       = "PagoPA"
  Source      = "https://github.com/pagopa/interop-tracing-deployment"
}

eks_cluster_name = "tracing-eks-cluster-dev"

sns_topic_name = "tracing-platform-alarms-dev"

cloudwatch_log_group_name = "/aws/eks/tracing-eks-cluster-dev/application"
1 change: 1 addition & 0 deletions terraform/k8s-monitoring/terraform.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
#!/usr/bin/env bash
# Thin wrapper: sources the shared terraform.sh from the parent directory in
# the current shell. It must be run from this directory, since the path is
# relative to the working directory.
. ../terraform.sh
Loading

0 comments on commit 0c2045b

Please sign in to comment.