From 120e419e98b520055816b0e1259c00d0041255fa Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Fri, 9 Feb 2024 15:51:33 +0100 Subject: [PATCH] Kubernetes prometheus telemetry + functional tests Signed-off-by: Tullio Sebastiani --- .github/workflows/tests.yml | 39 ++++++++++++++++++---- CI/config/common_test_config.yaml | 16 +++++++-- CI/run.sh | 3 +- CI/tests/test_telemetry.sh | 33 ++++++++++++++++++ requirements.txt | 4 +-- run_kraken.py | 48 +++++++++++++++++++++------ scenarios/arcaflow/cpu-hog/input.yaml | 3 +- 7 files changed, 121 insertions(+), 25 deletions(-) create mode 100644 CI/tests/test_telemetry.sh diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index a87790218..bac461167 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -1,8 +1,12 @@ name: Functional & Unit Tests on: pull_request: + push: + branches: + - main jobs: tests: + # Common steps name: Functional & Unit Tests runs-on: ubuntu-latest steps: @@ -47,8 +51,7 @@ jobs: sudo apt-get install build-essential python3-dev pip install --upgrade pip pip install -r requirements.txt -# - name: Run unit tests -# run: python -m coverage run -a -m unittest discover -s tests -v + - name: Deploy test workloads run: | kubectl apply -f CI/templates/outage_pod.yaml @@ -61,10 +64,14 @@ jobs: - name: Get Kind nodes run: | kubectl get nodes --show-labels=true + # Pull request only steps + - name: Run unit tests + if: github.event_name == 'pull_request' + run: python -m coverage run -a -m unittest discover -s tests -v - - name: Setup Functional Tests + - name: Setup Pull Request Functional Tests + if: github.event_name == 'pull_request' run: | - yq -i '.kraken.distribution="kubernetes"' CI/config/common_test_config.yaml yq -i '.kraken.port="8081"' CI/config/common_test_config.yaml yq -i '.kraken.signal_address="0.0.0.0"' CI/config/common_test_config.yaml yq -i '.kraken.performance_monitoring="localhost:9090"' CI/config/common_test_config.yaml @@ -76,13 +83,33 @@ 
jobs: echo "test_arca_cpu_hog" >> ./CI/tests/functional_tests echo "test_arca_memory_hog" >> ./CI/tests/functional_tests echo "test_arca_io_hog" >> ./CI/tests/functional_tests + + # Push on main only steps + - name: Configure AWS Credentials + if: github.ref == 'refs/heads/main' && github.event_name == 'push' + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region : ${{ secrets.AWS_REGION }} + - name: Setup Post Merge Request Functional Tests + if: github.ref == 'refs/heads/main' && github.event_name == 'push' + run: | + yq -i '.kraken.port="8081"' CI/config/common_test_config.yaml + yq -i '.kraken.signal_address="0.0.0.0"' CI/config/common_test_config.yaml + yq -i '.kraken.performance_monitoring="localhost:9090"' CI/config/common_test_config.yaml + yq -i '.telemetry.username="${{secrets.TELEMETRY_USERNAME}}"' CI/config/common_test_config.yaml + yq -i '.telemetry.password="${{secrets.TELEMETRY_PASSWORD}}"' CI/config/common_test_config.yaml + echo "test_telemetry" > ./CI/tests/functional_tests + + # Final common steps - name: Run Functional tests + env: + AWS_BUCKET: ${{ secrets.AWS_BUCKET }} run: | ./CI/run.sh cat ./CI/results.markdown >> $GITHUB_STEP_SUMMARY echo >> $GITHUB_STEP_SUMMARY - - name: Run Unit tests - run: python -m coverage run -a -m unittest discover -s tests -v - name: Upload CI logs uses: actions/upload-artifact@v3 with: diff --git a/CI/config/common_test_config.yaml b/CI/config/common_test_config.yaml index c5e758eba..550c54786 100644 --- a/CI/config/common_test_config.yaml +++ b/CI/config/common_test_config.yaml @@ -1,5 +1,5 @@ kraken: - distribution: openshift # Distribution can be kubernetes or openshift. + distribution: kubernetes # Distribution can be kubernetes or openshift. kubeconfig_path: ~/.kube/config # Path to kubeconfig. exit_on_failure: False # Exit when a post action scenario fails. 
litmus_version: v1.13.6 # Litmus version to install. @@ -30,8 +30,11 @@ tunings: telemetry: enabled: False # enable/disables the telemetry collection feature api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production #telemetry service endpoint - username: username # telemetry service username - password: password # telemetry service password + username: $TELEMETRY_USERNAME # telemetry service username + password: $TELEMETRY_PASSWORD # telemetry service password + prometheus_namespace: 'prometheus-k8s' # prometheus namespace + prometheus_pod_name: 'prometheus-kind-prometheus-kube-prome-prometheus-0' # prometheus pod_name + prometheus_container_name: 'prometheus' prometheus_backup: True # enables/disables prometheus data collection full_prometheus_backup: False # if is set to False only the /prometheus/wal folder will be downloaded. backup_threads: 5 # number of telemetry download/upload threads @@ -39,3 +42,10 @@ telemetry: max_retries: 0 # maximum number of upload retries (if 0 will retry forever) run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs) archive_size: 10000 # the size of the prometheus data archive size in KB. 
The lower the size of archive is + logs_backup: True + logs_filter_patterns: + - "(\\w{3}\\s\\d{1,2}\\s\\d{2}:\\d{2}:\\d{2}\\.\\d+).+" # Sep 9 11:20:36.123425532 + - "kinit (\\d+/\\d+/\\d+\\s\\d{2}:\\d{2}:\\d{2})\\s+" # kinit 2023/09/15 11:20:36 log + - "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+" # 2023-09-15T11:20:36.123425532Z log + oc_cli_path: /usr/bin/oc # optional, if not specified will be searched in $PATH + events_backup: True # enables/disables cluster events collection diff --git a/CI/run.sh b/CI/run.sh index e3bd9e8fe..0bf8b73d0 100755 --- a/CI/run.sh +++ b/CI/run.sh @@ -39,7 +39,7 @@ echo '-----------------------|--------|---------' >> $results failed_tests=() for test_name in `cat CI/tests/functional_tests` do - wait_cluster_become_ready + #wait_cluster_become_ready return_value=`./CI/run_test.sh $test_name $results` if [[ $return_value == 1 ]] then @@ -49,6 +49,7 @@ do wait_cluster_become_ready done + if (( ${#failed_tests[@]}>0 )) then echo -e "\n\n======================================================================" diff --git a/CI/tests/test_telemetry.sh b/CI/tests/test_telemetry.sh new file mode 100644 index 000000000..531d6942b --- /dev/null +++ b/CI/tests/test_telemetry.sh @@ -0,0 +1,33 @@ +set -xeEo pipefail + +source CI/tests/common.sh + +trap error ERR +trap finish EXIT + + +function functional_test_telemetry { + AWS_CLI=`which aws` + [ -z "$AWS_CLI" ]&& echo "AWS cli not found in path" && exit 1 + [ -z "$AWS_BUCKET" ] && echo "AWS bucket not set in environment" && exit 1 + + export RUN_TAG="funtest-telemetry" + yq -i '.telemetry.enabled=True' CI/config/common_test_config.yaml + yq -i '.telemetry.full_prometheus_backup=True' CI/config/common_test_config.yaml + yq -i '.telemetry.run_tag=env(RUN_TAG)' CI/config/common_test_config.yaml + export scenario_type="arcaflow_scenarios" + export scenario_file="scenarios/arcaflow/cpu-hog/input.yaml" + export post_config="" + envsubst < CI/config/common_test_config.yaml > 
CI/config/telemetry.yaml + python3 -m coverage run -a run_kraken.py -c CI/config/telemetry.yaml + RUN_FOLDER=`cat CI/out/test_telemetry.out | grep amazonaws.com | sed -rn "s#.*https:\/\/.*\/download/(.*)#\1#p"` + $AWS_CLI s3 ls "s3://$AWS_BUCKET/$RUN_FOLDER/" | awk '{ print $4 }' > s3_remote_files + echo "checking if telemetry files are uploaded on s3" + cat s3_remote_files | grep events-00.json || ( echo "FAILED: events-00.json not uploaded" && exit 1 ) + cat s3_remote_files | grep prometheus-00.tar || ( echo "FAILED: prometheus backup not uploaded" && exit 1 ) + cat s3_remote_files | grep telemetry.json || ( echo "FAILED: telemetry.json not uploaded" && exit 1 ) + echo "all files uploaded!" + echo "Telemetry Collection: Success" +} + +functional_test_telemetry \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index a0a4ae4e9..03d509308 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,11 +19,11 @@ ibm_cloud_sdk_core ibm_vpc itsdangerous==2.0.1 jinja2==3.1.3 -krkn-lib >= 1.4.6 +krkn-lib >= 1.4.9 kubernetes lxml >= 4.3.0 oauth2client>=4.1.3 -openshift-client +openshift-client == 1.0.21 paramiko podman-compose pyVmomi >= 6.7 diff --git a/run_kraken.py b/run_kraken.py index 0bae40675..d08237b4c 100644 --- a/run_kraken.py +++ b/run_kraken.py @@ -156,12 +156,17 @@ def main(cfg): # Cluster info logging.info("Fetching cluster info") cv = "" - if config["kraken"]["distribution"] == "openshift": + if distribution == "openshift": cv = ocpcli.get_clusterversion_string() if prometheus_url is None: - connection_data = ocpcli.get_prometheus_api_connection_data() - prometheus_url = connection_data.endpoint - prometheus_bearer_token = connection_data.token + try: + connection_data = ocpcli.get_prometheus_api_connection_data() + prometheus_url = connection_data.endpoint + prometheus_bearer_token = connection_data.token + except Exception: + logging.error("invalid distribution selected, running openshift scenarios against kubernetes cluster." 
+ "Please set 'kubernetes' in config.yaml krkn.platform and try again") + sys.exit(1) if cv != "": logging.info(cv) else: @@ -366,7 +371,7 @@ def main(cfg): # if platform is openshift will be collected # Cloud platform and network plugins metadata # through OCP specific APIs - if config["kraken"]["distribution"] == "openshift": + if distribution == "openshift": telemetry_ocp.collect_cluster_metadata(chaos_telemetry) else: telemetry_k8s.collect_cluster_metadata(chaos_telemetry) @@ -381,12 +386,33 @@ def main(cfg): telemetry_k8s.send_telemetry(config["telemetry"], telemetry_request_id, chaos_telemetry) telemetry_k8s.put_cluster_events(telemetry_request_id, config["telemetry"], start_time, end_time) # prometheus data collection is available only on Openshift - if config["telemetry"]["prometheus_backup"] and config["kraken"]["distribution"] == "openshift": - safe_logger.info("archives download started:") - prometheus_archive_files = telemetry_ocp.get_ocp_prometheus_data(config["telemetry"], telemetry_request_id) - safe_logger.info("archives upload started:") - telemetry_k8s.put_prometheus_data(config["telemetry"], prometheus_archive_files, telemetry_request_id) - if config["telemetry"]["logs_backup"]: + if config["telemetry"]["prometheus_backup"]: + prometheus_archive_files = '' + if distribution == "openshift" : + prometheus_archive_files = telemetry_ocp.get_ocp_prometheus_data(config["telemetry"], telemetry_request_id) + else: + if (config["telemetry"]["prometheus_namespace"] and + config["telemetry"]["prometheus_pod_name"] and + config["telemetry"]["prometheus_container_name"]): + try: + prometheus_archive_files = telemetry_k8s.get_prometheus_pod_data( + config["telemetry"], + telemetry_request_id, + config["telemetry"]["prometheus_pod_name"], + config["telemetry"]["prometheus_container_name"], + config["telemetry"]["prometheus_namespace"] + ) + except Exception as e: + logging.error(f"failed to get prometheus backup with exception {str(e)}") + else: + 
logging.warning("impossible to backup prometheus," + "check if config contains telemetry.prometheus_namespace, " + "telemetry.prometheus_pod_name and " + "telemetry.prometheus_container_name") + if prometheus_archive_files: + safe_logger.info("starting prometheus archive upload:") + telemetry_k8s.put_prometheus_data(config["telemetry"], prometheus_archive_files, telemetry_request_id) + if config["telemetry"]["logs_backup"] and distribution == "openshift": telemetry_ocp.put_ocp_logs(telemetry_request_id, config["telemetry"], start_time, end_time) except Exception as e: logging.error(f"failed to send telemetry data: {str(e)}") diff --git a/scenarios/arcaflow/cpu-hog/input.yaml b/scenarios/arcaflow/cpu-hog/input.yaml index 9c80fd378..1361bbaa0 100644 --- a/scenarios/arcaflow/cpu-hog/input.yaml +++ b/scenarios/arcaflow/cpu-hog/input.yaml @@ -5,5 +5,4 @@ input_list: duration: 1s kubeconfig: '' namespace: default - node_selector: - kubernetes.io/hostname: kind-worker2 + node_selector: {}