From c2896d08483ac929ccac336e0a78b8c92be61de8 Mon Sep 17 00:00:00 2001 From: jtydlcak <139967002+jtydlack@users.noreply.github.com> Date: Tue, 6 Feb 2024 13:17:58 +0100 Subject: [PATCH] Add json output option for recommender (#511) --- .gitignore | 1 + config/recommender_config.yaml | 1 + kraken/chaos_recommender/analysis.py | 56 ++++++++++++++++-- kraken/chaos_recommender/prometheus.py | 7 +-- utils/chaos_recommender/README.md | 1 + utils/chaos_recommender/chaos_recommender.py | 62 +++++++++++++++++--- 6 files changed, 111 insertions(+), 17 deletions(-) diff --git a/.gitignore b/.gitignore index c510e7ead..4fb279b1f 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ __pycache__/* *.out kube-burner* kube_burner* +recommender_*.json # Project files .ropeproject diff --git a/config/recommender_config.yaml b/config/recommender_config.yaml index 5c235faa6..b0234432c 100644 --- a/config/recommender_config.yaml +++ b/config/recommender_config.yaml @@ -7,6 +7,7 @@ auth_token: scrape_duration: 10m chaos_library: "kraken" log_level: INFO +JSON_output: False # for output purpose only do not change if not needed chaos_tests: diff --git a/kraken/chaos_recommender/analysis.py b/kraken/chaos_recommender/analysis.py index 40db4f5cc..bacf182bf 100644 --- a/kraken/chaos_recommender/analysis.py +++ b/kraken/chaos_recommender/analysis.py @@ -10,7 +10,8 @@ KRAKEN_TESTS_PATH = "./kraken_chaos_tests.txt" -#Placeholder, this should be done with topology + +# Placeholder, this should be done with topology def return_critical_services(): return ["web", "cart"] @@ -19,6 +20,7 @@ def load_telemetry_data(file_path): data = pd.read_csv(file_path, delimiter=r"\s+") return data + def calculate_zscores(data): zscores = pd.DataFrame() zscores["Service"] = data["service"] @@ -27,6 +29,7 @@ def calculate_zscores(data): zscores["Network"] = (data["NETWORK"] - data["NETWORK"].mean()) / data["NETWORK"].std() return zscores + def identify_outliers(data): outliers_cpu = data[data["CPU"] > threshold]["Service"].tolist() outliers_memory = data[data["Memory"] > threshold]["Service"].tolist() @@ -47,7 +50,7 @@ def get_services_above_heatmap_threshold(dataframe, cpu_threshold, mem_threshold return cpu_services, mem_services -def analysis(file_path, chaos_tests_config): +def analysis(file_path, chaos_tests_config, json_output): # Load the telemetry data from file data = load_telemetry_data(file_path) @@ -63,28 +66,71 @@ def analysis(file_path, chaos_tests_config): logging.info(f"CPU outliers: {outliers_cpu}") logging.info(f"Memory outliers: {outliers_memory}") logging.info(f"Network outliers: {outliers_network}") + logging.info("===================== HeatMap Analysis ==============================") if cpu_services: logging.info("Services with CPU_HEATMAP above threshold:", cpu_services) else: - logging.info("There are no services that are using siginificant CPU compared to their assigned limits (infinite in case no limits are set).") + logging.info("There are no services that are using significant CPU compared to their assigned limits (infinite in case no limits are set).") if mem_services: logging.info("Services with MEM_HEATMAP above threshold:", mem_services) else: - logging.info("There are no services that are using siginificant MEMORY compared to their assigned limits (infinite in case no limits are set).") + logging.info("There are no services that are using significant MEMORY compared to their assigned limits (infinite in case no limits are set).") time.sleep(2) + logging.info("======================= Recommendations =============================") + if cpu_services: logging.info(f"Recommended tests for {str(cpu_services)} :\n {chaos_tests_config['CPU']}") logging.info("\n") + if mem_services: logging.info(f"Recommended tests for {str(mem_services)} :\n {chaos_tests_config['MEM']}") logging.info("\n") if outliers_network: - logging.info(f"Recommended tests for str(outliers_network) :\n {chaos_tests_config['NETWORK']}") + logging.info(f"Recommended tests for {str(outliers_network)} :\n {chaos_tests_config['NETWORK']}") logging.info("\n") logging.info("\n") logging.info("Please check data in utilisation.txt for further analysis") + + if json_output is True: + analysis_data = json_struct(outliers_cpu, outliers_memory, outliers_network, cpu_services, mem_services, chaos_tests_config) + return analysis_data + + +def json_struct(outliers_cpu, outliers_memory, outliers_network, cpu_services, + mem_services, chaos_tests_config): + + profiling = { + "cpu_outliers": outliers_cpu, + "memory_outliers": outliers_memory, + "network_outliers": outliers_network + } + + heatmap = { + "services_with_cpu_heatmap_above_threshold": cpu_services, + "services_with_mem_heatmap_above_threshold": mem_services + } + + recommendations = {} + + if cpu_services: + cpu_recommend = {"services": cpu_services, + "tests": chaos_tests_config['CPU']} + recommendations["cpu_services_recommendations"] = cpu_recommend + + if mem_services: + mem_recommend = {"services": mem_services, + "tests": chaos_tests_config['MEM']} + recommendations["mem_services_recommendations"] = mem_recommend + + if outliers_network: + outliers_network_recommend = {"outliers_networks": outliers_network, + "tests": chaos_tests_config['NETWORK']} + recommendations["outliers_network_recommendations"] = ( + outliers_network_recommend) + + return [profiling, heatmap, recommendations] diff --git a/kraken/chaos_recommender/prometheus.py b/kraken/chaos_recommender/prometheus.py index ba9d913e4..3d7861fe1 100644 --- a/kraken/chaos_recommender/prometheus.py +++ b/kraken/chaos_recommender/prometheus.py @@ -57,6 +57,7 @@ def save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_r merged_df.to_csv(filename, sep='\t', index=False) + def fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration): urllib3.disable_warnings() prometheus = PrometheusConnect(url=prometheus_endpoint, headers={'Authorization':'Bearer {}'.format(auth_token)}, disable_ssl=True) @@ -89,8 +90,6 @@ def fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace logging.info(network_query) network_data = network_result - + queries = [cpu_query, cpu_limits_query, mem_query, mem_limits_query] save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_result, network_data, saved_metrics_path) - return saved_metrics_path - - + return saved_metrics_path, queries diff --git a/utils/chaos_recommender/README.md b/utils/chaos_recommender/README.md index e9aefc2a2..877b680f0 100644 --- a/utils/chaos_recommender/README.md +++ b/utils/chaos_recommender/README.md @@ -74,6 +74,7 @@ You can also provide the input values through command-line arguments launching t Chaos library -L LOG_LEVEL, --log-level LOG_LEVEL log level (DEBUG, INFO, WARNING, ERROR, CRITICAL + -J, --json-output Makes JSON output -M MEM [MEM ...], --MEM MEM [MEM ...] Memory related chaos tests (space separated list) -C CPU [CPU ...], --CPU CPU [CPU ...] diff --git a/utils/chaos_recommender/chaos_recommender.py b/utils/chaos_recommender/chaos_recommender.py index 23629cc85..54cae55bc 100644 --- a/utils/chaos_recommender/chaos_recommender.py +++ b/utils/chaos_recommender/chaos_recommender.py @@ -1,8 +1,12 @@ import argparse +import json import logging import os.path import sys +import time + import yaml + # kraken module import for running the recommender # both from the root directory and the recommender # folder @@ -14,7 +18,6 @@ from kubernetes import config as kube_config - def parse_arguments(parser): # command line options @@ -27,6 +30,9 @@ def parse_arguments(parser): parser.add_argument("-s", "--scrape-duration", action="store", default="10m", help="Prometheus scrape duration") parser.add_argument("-L", "--log-level", action="store", default="INFO", help="log level (DEBUG, INFO, WARNING, ERROR, CRITICAL") + parser.add_argument("-J", "--json-output", action="store_true", + help="Makes json output") + parser.add_argument("-M", "--MEM", nargs='+', action="store", default=[], help="Memory related chaos tests (space separated list)") parser.add_argument("-C", "--CPU", nargs='+', action="store", default=[], @@ -36,9 +42,9 @@ def parse_arguments(parser): parser.add_argument("-G", "--GENERIC", nargs='+', action="store", default=[], help="Memory related chaos tests (space separated list)") - return parser.parse_args() + def read_configuration(config_file_path): if not os.path.exists(config_file_path): logging.error(f"Config file not found: {config_file_path}") @@ -54,9 +60,11 @@ def read_configuration(config_file_path): prometheus_endpoint = config.get("prometheus_endpoint", "") auth_token = config.get("auth_token", "") scrape_duration = config.get("scrape_duration", "10m") - chaos_tests = config.get("chaos_tests" , {}) + chaos_tests = config.get("chaos_tests", {}) + json_output = config.get("JSON_output", False) return (namespace, kubeconfig, prometheus_endpoint, auth_token, scrape_duration, - chaos_tests, log_level) + chaos_tests, log_level, json_output) + def prompt_input(prompt, default_value): user_input = input(f"{prompt} [{default_value}]: ") @@ -64,6 +72,35 @@ def prompt_input(prompt, default_value): return user_input return default_value + +def make_json_output(file, namespace, kubeconfig, prometheus_endpoint, chaos_tests, scrape_duration, queries, analysis_data): + data = { + "inputs": [ + { + "namespace": namespace, + "kubeconfig": kubeconfig, + "prometheus_endpoint": prometheus_endpoint, + "scrape_duration": scrape_duration, + "chaos_tests": chaos_tests + } + ], + "analysis_start": [ + { + "cpu_query": queries[0], + "cpu_limit_query": queries[1], + "memory_query": queries[2], + "memory_limit_query": queries[3] + } + ], + "profiling": analysis_data[0], + "heatmap_analysis": analysis_data[1], + "recommendations": analysis_data[2] + } + file = f"utils/chaos_recommender/recommender_output/{file}" + with open(file, "w") as json_output: + json_output.write(json.dumps(data, indent=4)) + + def main(): parser = argparse.ArgumentParser(description="Krkn Chaos Recommender Command-Line tool") args = parse_arguments(parser) @@ -81,7 +118,8 @@ def main(): auth_token, scrape_duration, chaos_tests, - log_level + log_level, + json_output ) = read_configuration(args.config_file) if args.options: @@ -91,9 +129,10 @@ def main(): scrape_duration = args.scrape_duration log_level = args.log_level prometheus_endpoint = args.prometheus_endpoint + json_output = args.json_output chaos_tests = {"MEM": args.MEM, "GENERIC": args.GENERIC, "CPU": args.CPU, "NETWORK": args.NETWORK} - if log_level not in ["DEBUG","INFO", "WARNING", "ERROR","CRITICAL"]: + if log_level not in ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]: logging.error(f"{log_level} not a valid log level") sys.exit(1) @@ -110,8 +149,15 @@ def main(): logging.info("Starting Analysis ...") logging.info("Fetching the Telemetry data") - file_path = prometheus.fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration) - analysis(file_path, chaos_tests) + file_path, queries = prometheus.fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration) + analysis_data = analysis(file_path, chaos_tests, json_output) + + if json_output is True: + time_str = time.strftime("%d%m%y_%H%M%S", time.localtime()) + recommendation_json = f"recommender_{namespace}_{time_str}.json" + make_json_output(recommendation_json, namespace, kubeconfig, prometheus_endpoint, chaos_tests, scrape_duration, queries, analysis_data) + logging.info(f"Recommendation output is in file {recommendation_json} in chaos recommender output folder.") + if __name__ == "__main__": main()