Skip to content

Commit

Permalink
Kubernetes prometheus telemetry + functional tests
Browse files Browse the repository at this point in the history
Signed-off-by: Tullio Sebastiani <[email protected]>
  • Loading branch information
tsebastiani committed Feb 9, 2024
1 parent fa59834 commit 120e419
Show file tree
Hide file tree
Showing 7 changed files with 121 additions and 25 deletions.
39 changes: 33 additions & 6 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
name: Functional & Unit Tests
on:
pull_request:
push:
branches:
- main
jobs:
tests:
# Common steps
name: Functional & Unit Tests
runs-on: ubuntu-latest
steps:
Expand Down Expand Up @@ -47,8 +51,7 @@ jobs:
sudo apt-get install build-essential python3-dev
pip install --upgrade pip
pip install -r requirements.txt
# - name: Run unit tests
# run: python -m coverage run -a -m unittest discover -s tests -v
- name: Deploy test workloads
run: |
kubectl apply -f CI/templates/outage_pod.yaml
Expand All @@ -61,10 +64,14 @@ jobs:
- name: Get Kind nodes
run: |
kubectl get nodes --show-labels=true
# Pull request only steps
- name: Run unit tests
if: github.event_name == 'pull_request'
run: python -m coverage run -a -m unittest discover -s tests -v

- name: Setup Functional Tests
- name: Setup Pull Request Functional Tests
if: github.event_name == 'pull_request'
run: |
yq -i '.kraken.distribution="kubernetes"' CI/config/common_test_config.yaml
yq -i '.kraken.port="8081"' CI/config/common_test_config.yaml
yq -i '.kraken.signal_address="0.0.0.0"' CI/config/common_test_config.yaml
yq -i '.kraken.performance_monitoring="localhost:9090"' CI/config/common_test_config.yaml
Expand All @@ -76,13 +83,33 @@ jobs:
echo "test_arca_cpu_hog" >> ./CI/tests/functional_tests
echo "test_arca_memory_hog" >> ./CI/tests/functional_tests
echo "test_arca_io_hog" >> ./CI/tests/functional_tests
# Push on main only steps
- name: Configure AWS Credentials
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
uses: aws-actions/configure-aws-credentials@v4
with:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region : ${{ secrets.AWS_REGION }}
- name: Setup Post Merge Request Functional Tests
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
run: |
yq -i '.kraken.port="8081"' CI/config/common_test_config.yaml
yq -i '.kraken.signal_address="0.0.0.0"' CI/config/common_test_config.yaml
yq -i '.kraken.performance_monitoring="localhost:9090"' CI/config/common_test_config.yaml
yq -i '.telemetry.username="${{secrets.TELEMETRY_USERNAME}}"' CI/config/common_test_config.yaml
yq -i '.telemetry.password="${{secrets.TELEMETRY_PASSWORD}}"' CI/config/common_test_config.yaml
echo "test_telemetry" > ./CI/tests/functional_tests
# Final common steps
- name: Run Functional tests
env:
AWS_BUCKET: ${{ secrets.AWS_BUCKET }}
run: |
./CI/run.sh
cat ./CI/results.markdown >> $GITHUB_STEP_SUMMARY
echo >> $GITHUB_STEP_SUMMARY
- name: Run Unit tests
run: python -m coverage run -a -m unittest discover -s tests -v
- name: Upload CI logs
uses: actions/upload-artifact@v3
with:
Expand Down
16 changes: 13 additions & 3 deletions CI/config/common_test_config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
kraken:
distribution: openshift # Distribution can be kubernetes or openshift.
distribution: kubernetes # Distribution can be kubernetes or openshift.
kubeconfig_path: ~/.kube/config # Path to kubeconfig.
exit_on_failure: False # Exit when a post action scenario fails.
litmus_version: v1.13.6 # Litmus version to install.
Expand Down Expand Up @@ -30,12 +30,22 @@ tunings:
telemetry:
enabled: False # enable/disables the telemetry collection feature
api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production #telemetry service endpoint
username: username # telemetry service username
password: password # telemetry service password
username: $TELEMETRY_USERNAME # telemetry service username
password: $TELEMETRY_PASSWORD # telemetry service password
prometheus_namespace: 'prometheus-k8s' # prometheus namespace
prometheus_pod_name: 'prometheus-kind-prometheus-kube-prome-prometheus-0' # prometheus pod_name
prometheus_container_name: 'prometheus'
prometheus_backup: True # enables/disables prometheus data collection
full_prometheus_backup: False # if is set to False only the /prometheus/wal folder will be downloaded.
backup_threads: 5 # number of telemetry download/upload threads
archive_path: /tmp # local path where the archive files will be temporarly stored
max_retries: 0 # maximum number of upload retries (if 0 will retry forever)
run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs)
archive_size: 10000 # the size of the prometheus data archive size in KB. The lower the size of archive is
logs_backup: True
logs_filter_patterns:
- "(\\w{3}\\s\\d{1,2}\\s\\d{2}:\\d{2}:\\d{2}\\.\\d+).+" # Sep 9 11:20:36.123425532
- "kinit (\\d+/\\d+/\\d+\\s\\d{2}:\\d{2}:\\d{2})\\s+" # kinit 2023/09/15 11:20:36 log
- "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+" # 2023-09-15T11:20:36.123425532Z log
oc_cli_path: /usr/bin/oc # optional, if not specified will be search in $PATH
events_backup: True # enables/disables cluster events collection
3 changes: 2 additions & 1 deletion CI/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ echo '-----------------------|--------|---------' >> $results
failed_tests=()
for test_name in `cat CI/tests/functional_tests`
do
wait_cluster_become_ready
#wait_cluster_become_ready
return_value=`./CI/run_test.sh $test_name $results`
if [[ $return_value == 1 ]]
then
Expand All @@ -49,6 +49,7 @@ do
wait_cluster_become_ready
done


if (( ${#failed_tests[@]}>0 ))
then
echo -e "\n\n======================================================================"
Expand Down
33 changes: 33 additions & 0 deletions CI/tests/test_telemetry.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
set -xeEo pipefail

source CI/tests/common.sh

trap error ERR
trap finish EXIT


function functional_test_telemetry {
AWS_CLI=`which aws`
[ -z "$AWS_CLI" ]&& echo "AWS cli not found in path" && exit 1
[ -z "$AWS_BUCKET" ] && echo "AWS bucket not set in environment" && exit 1

export RUN_TAG="funtest-telemetry"
yq -i '.telemetry.enabled=True' CI/config/common_test_config.yaml
yq -i '.telemetry.full_prometheus_backup=True' CI/config/common_test_config.yaml
yq -i '.telemetry.run_tag=env(RUN_TAG)' CI/config/common_test_config.yaml
export scenario_type="arcaflow_scenarios"
export scenario_file="scenarios/arcaflow/cpu-hog/input.yaml"
export post_config=""
envsubst < CI/config/common_test_config.yaml > CI/config/telemetry.yaml
python3 -m coverage run -a run_kraken.py -c CI/config/telemetry.yaml
RUN_FOLDER=`cat CI/out/test_telemetry.out | grep amazonaws.com | sed -rn "s#.*https:\/\/.*\/download/(.*)#\1#p"`
$AWS_CLI s3 ls "s3://$AWS_BUCKET/$RUN_FOLDER/" | awk '{ print $4 }' > s3_remote_files
echo "checking if telemetry files are uploaded on s3"
cat s3_remote_files | grep events-00.json || ( echo "FAILED: events-00.json not uploaded" && exit 1 )
cat s3_remote_files | grep prometheus-00.tar || ( echo "FAILED: prometheus backup not uploaded" && exit 1 )
cat s3_remote_files | grep telemetry.json || ( echo "FAILED: telemetry.json not uploaded" && exit 1 )
echo "all files uploaded!"
echo "Telemetry Collection: Success"
}

functional_test_telemetry
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@ ibm_cloud_sdk_core
ibm_vpc
itsdangerous==2.0.1
jinja2==3.1.3
krkn-lib >= 1.4.6
krkn-lib >= 1.4.9
kubernetes
lxml >= 4.3.0
oauth2client>=4.1.3
openshift-client
openshift-client == 1.0.21
paramiko
podman-compose
pyVmomi >= 6.7
Expand Down
48 changes: 37 additions & 11 deletions run_kraken.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,12 +156,17 @@ def main(cfg):
# Cluster info
logging.info("Fetching cluster info")
cv = ""
if config["kraken"]["distribution"] == "openshift":
if distribution == "openshift":
cv = ocpcli.get_clusterversion_string()
if prometheus_url is None:
connection_data = ocpcli.get_prometheus_api_connection_data()
prometheus_url = connection_data.endpoint
prometheus_bearer_token = connection_data.token
try:
connection_data = ocpcli.get_prometheus_api_connection_data()
prometheus_url = connection_data.endpoint
prometheus_bearer_token = connection_data.token
except Exception:
logging.error("invalid distribution selected, running openshift scenarios against kubernetes cluster."
"Please set 'kubernetes' in config.yaml krkn.platform and try again")
sys.exit(1)
if cv != "":
logging.info(cv)
else:
Expand Down Expand Up @@ -366,7 +371,7 @@ def main(cfg):
# if platform is openshift will be collected
# Cloud platform and network plugins metadata
# through OCP specific APIs
if config["kraken"]["distribution"] == "openshift":
if distribution == "openshift":
telemetry_ocp.collect_cluster_metadata(chaos_telemetry)
else:
telemetry_k8s.collect_cluster_metadata(chaos_telemetry)
Expand All @@ -381,12 +386,33 @@ def main(cfg):
telemetry_k8s.send_telemetry(config["telemetry"], telemetry_request_id, chaos_telemetry)
telemetry_k8s.put_cluster_events(telemetry_request_id, config["telemetry"], start_time, end_time)
# prometheus data collection is available only on Openshift
if config["telemetry"]["prometheus_backup"] and config["kraken"]["distribution"] == "openshift":
safe_logger.info("archives download started:")
prometheus_archive_files = telemetry_ocp.get_ocp_prometheus_data(config["telemetry"], telemetry_request_id)
safe_logger.info("archives upload started:")
telemetry_k8s.put_prometheus_data(config["telemetry"], prometheus_archive_files, telemetry_request_id)
if config["telemetry"]["logs_backup"]:
if config["telemetry"]["prometheus_backup"]:
prometheus_archive_files = ''
if distribution == "openshift" :
prometheus_archive_files = telemetry_ocp.get_ocp_prometheus_data(config["telemetry"], telemetry_request_id)
else:
if (config["telemetry"]["prometheus_namespace"] and
config["telemetry"]["prometheus_pod_name"] and
config["telemetry"]["prometheus_container_name"]):
try:
prometheus_archive_files = telemetry_k8s.get_prometheus_pod_data(
config["telemetry"],
telemetry_request_id,
config["telemetry"]["prometheus_pod_name"],
config["telemetry"]["prometheus_container_name"],
config["telemetry"]["prometheus_namespace"]
)
except Exception as e:
logging.error(f"failed to get prometheus backup with exception {str(e)}")
else:
logging.warning("impossible to backup prometheus,"
"check if config contains telemetry.prometheus_namespace, "
"telemetry.prometheus_pod_name and "
"telemetry.prometheus_container_name")
if prometheus_archive_files:
safe_logger.info("starting prometheus archive upload:")
telemetry_k8s.put_prometheus_data(config["telemetry"], prometheus_archive_files, telemetry_request_id)
if config["telemetry"]["logs_backup"] and distribution == "openshift":
telemetry_ocp.put_ocp_logs(telemetry_request_id, config["telemetry"], start_time, end_time)
except Exception as e:
logging.error(f"failed to send telemetry data: {str(e)}")
Expand Down
3 changes: 1 addition & 2 deletions scenarios/arcaflow/cpu-hog/input.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,4 @@ input_list:
duration: 1s
kubeconfig: ''
namespace: default
node_selector:
kubernetes.io/hostname: kind-worker2
node_selector: {}

0 comments on commit 120e419

Please sign in to comment.