From ffb92d5fcd8734e11999ebfc30f4564744801229 Mon Sep 17 00:00:00 2001 From: henrick Date: Mon, 4 Nov 2024 20:27:11 -0500 Subject: [PATCH 1/3] Add support for user-provided default network ACL Signed-off-by: henrick --- .../zone_outage/zone_outage_scenario_plugin.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/krkn/scenario_plugins/zone_outage/zone_outage_scenario_plugin.py b/krkn/scenario_plugins/zone_outage/zone_outage_scenario_plugin.py index c2a83ee5..ee66dfde 100644 --- a/krkn/scenario_plugins/zone_outage/zone_outage_scenario_plugin.py +++ b/krkn/scenario_plugins/zone_outage/zone_outage_scenario_plugin.py @@ -29,6 +29,8 @@ def run( subnet_ids = scenario_config["subnet_id"] duration = scenario_config["duration"] cloud_type = scenario_config["cloud_type"] + # Add support for user-provided default network ACL + default_acl_id = scenario_config.get("default_acl_id") ids = {} acl_ids_created = [] @@ -58,7 +60,15 @@ def run( "Network association ids associated with " "the subnet %s: %s" % (subnet_id, network_association_ids) ) - acl_id = cloud_object.create_default_network_acl(vpc_id) + + # Use provided default ACL if available, otherwise create a new one + if default_acl_id: + acl_id = default_acl_id + # Don't add to acl_id since we didn't create it + else: + acl_id = cloud_object.create_default_network_acl(vpc_id) + acl_ids_created.append(acl_id) + new_association_id = cloud_object.replace_network_acl_association( network_association_ids[0], acl_id ) @@ -66,7 +76,6 @@ def run( # capture the orginal_acl_id, created_acl_id and # new association_id to use during the recovery ids[new_association_id] = original_acl_id - acl_ids_created.append(acl_id) # wait for the specified duration logging.info( From 8b1e5675d024b733c6aea1c29e161b14bc3a87f5 Mon Sep 17 00:00:00 2001 From: henrick Date: Tue, 5 Nov 2024 22:23:42 -0500 Subject: [PATCH 2/3] Add logs to notify user when their provided acl is used Signed-off-by: henrick --- .../zone_outage/zone_outage_scenario_plugin.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/krkn/scenario_plugins/zone_outage/zone_outage_scenario_plugin.py b/krkn/scenario_plugins/zone_outage/zone_outage_scenario_plugin.py index ee66dfde..bce7d051 100644 --- a/krkn/scenario_plugins/zone_outage/zone_outage_scenario_plugin.py +++ b/krkn/scenario_plugins/zone_outage/zone_outage_scenario_plugin.py @@ -64,9 +64,14 @@ def run( # Use provided default ACL if available, otherwise create a new one if default_acl_id: acl_id = default_acl_id - # Don't add to acl_id since we didn't create it + logging.info( + "Using provided default ACL ID %s - this ACL will not be deleted after the scenario", + default_acl_id + ) + # Don't add to acl_ids_created since we don't want to delete user-provided ACLs at cleanup else: acl_id = cloud_object.create_default_network_acl(vpc_id) + logging.info("Created new default ACL %s", acl_id) acl_ids_created.append(acl_id) new_association_id = cloud_object.replace_network_acl_association( From de9de661959056277736ff0f6d69c24d2df56209 Mon Sep 17 00:00:00 2001 From: henrick Date: Wed, 6 Nov 2024 11:50:33 -0500 Subject: [PATCH 3/3] Update docs to include optional default_acl_id parameter in zone_outage Signed-off-by: henrick --- docs/zone_outage.md | 2 ++ scenarios/openshift/zone_outage.yaml | 1 + 2 files changed, 3 insertions(+) diff --git a/docs/zone_outage.md b/docs/zone_outage.md index 019e3fe0..2337baef 100644 --- a/docs/zone_outage.md +++ b/docs/zone_outage.md @@ -13,10 +13,12 @@ zone_outage: # Scenario to create an out duration: 600 # Duration in seconds after which the zone will be back online. vpc_id: # Cluster virtual private network to target. subnet_id: [subnet1, subnet2] # List of subnet-id's to deny both ingress and egress traffic. + default_acl_id: acl-xxxxxxxx # (Optional) ID of an existing network ACL to use instead of creating a new one. If provided, this ACL will not be deleted after the scenario. ``` **NOTE**: vpc_id and subnet_id can be obtained from the cloud web console by selecting one of the instances in the targeted zone ( us-west-2a for example ). **NOTE**: Multiple zones will experience downtime in case of targeting multiple subnets which might have an impact on the cluster health especially if the zones have control plane components deployed. +**NOTE**: default_acl_id can be obtained from the AWS VPC Console by selecting "Network ACLs" from the left sidebar ( the ID will be in the format 'acl-xxxxxxxx' ). Make sure the selected ACL has the desired ingress/egress rules for your outage scenario ( i.e., deny all ). ##### Debugging steps in case of failures In case of failures during the steps which revert back the network acl to allow traffic and bring back the cluster nodes in the zone, the nodes in the particular zone will be in `NotReady` condition. Here is how to fix it: diff --git a/scenarios/openshift/zone_outage.yaml b/scenarios/openshift/zone_outage.yaml index a54c000b..216cf020 100644 --- a/scenarios/openshift/zone_outage.yaml +++ b/scenarios/openshift/zone_outage.yaml @@ -3,3 +3,4 @@ zone_outage: # Scenario to create an out duration: 600 # duration in seconds after which the zone will be back online vpc_id: # cluster virtual private network to target subnet_id: [subnet1, subnet2] # List of subnet-id's to deny both ingress and egress traffic + default_acl_id: acl-xxxxxxxx # (Optional) ID of an existing network ACL to use instead of creating a new one. If provided, this ACL will not be deleted after the scenario.