tpcds-benchmark.yaml
# // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# // SPDX-License-Identifier: MIT-0
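# Runs the 3 TB TPC-DS benchmark against open-source Spark 3.1.2 on Amazon EKS,
# submitted as a SparkApplication via the Kubernetes Spark Operator.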
apiVersion: "sparkoperator.k8s.io/v1beta2"
kind: SparkApplication
metadata:
  name: tpcds-benchmark-oss
  namespace: oss
spec:
  type: Scala
  mode: cluster
  image: ghcr.io/aws-samples/eks-spark-benchmark:3.1.2
  imagePullPolicy: IfNotPresent
  sparkVersion: 3.1.2
  mainClass: com.amazonaws.eks.tpcds.BenchmarkSQL
  mainApplicationFile: local:///opt/spark/examples/jars/eks-spark-benchmark-assembly-1.0.jar
  arguments:
    # TPC-DS data location
    - "s3://blogpost-sparkoneks-us-east-1/blog/BLOG_TPCDS-TEST-3T-partitioned"
    # Results location
    - "s3://$(BUCKET_PARAM)/OSS_TPCDS-TEST-3T-RESULT"
    # Path to the TPC-DS kit inside the Docker image
    - "/opt/tpcds-kit/tools"
    # Data format
    - "parquet"
    # Scale factor (in GB)
    - "3000"
    # Number of iterations
    - "1"
    # Optimize queries with Hive tables
    - "false"
    # Filter queries; runs all queries if empty, e.g. "q98-v2.4,q99-v2.4,ss_max-v2.4,q95-v2.4"
    - ""
    # Logging set to WARN
    - "true"
  sparkConf:
    "spark.network.timeout": "2000s"
    "spark.executor.heartbeatInterval": "300s"
    # AQE (Adaptive Query Execution)
    "spark.sql.adaptive.enabled": "true"
    "spark.sql.adaptive.localShuffleReader.enabled": "true"
    "spark.sql.adaptive.coalescePartitions.enabled": "true"
    "spark.sql.adaptive.skewJoin.enabled": "true"
    # "spark.sql.adaptive.logLevel": "WARN"
    # IRSA for S3 connection
    "spark.kubernetes.executor.podNamePrefix": "oss-spark-tpcds"
    "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.WebIdentityTokenCredentialsProvider"
    "spark.hadoop.fs.s3.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem"
    "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version": "2"
"spark.executor.defaultJavaOptions": "-verbose:gc -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70"
    # Keep pods in a single AZ
    # "spark.kubernetes.node.selector.topology.kubernetes.io/zone": "us-east-1b"
    # "spark.kubernetes.node.selector.eks.amazonaws.com/capacityType": "ON_DEMAND"
  driver:
    initContainers:
      - name: volume-permission
        image: public.ecr.aws/docker/library/busybox
        command: ['sh', '-c', 'mkdir /ossdata1; chown -R 1000:1000 /ossdata1']
        volumeMounts:
          - name: spark-local-dir-1
            mountPath: /ossdata1
    env:
      - name: BUCKET_PARAM
        valueFrom:
          configMapKeyRef:
            name: special-config
            key: codeBucket
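    # The "special-config" ConfigMap is assumed to exist already, e.g. created with:
    #   kubectl create configmap special-config -n oss --from-literal=codeBucket=<your-bucket>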
    cores: 4
    coreLimit: "4.1"
    memory: "5g"
    memoryOverhead: "1000"
    serviceAccount: oss
    volumeMounts:
      - name: spark-local-dir-1
        mountPath: /ossdata1
  executor:
    initContainers:
      - name: volume-permission
        image: public.ecr.aws/docker/library/busybox
        command: ['sh', '-c', 'mkdir /ossdata1; chown -R 1000:1000 /ossdata1']
        volumeMounts:
          - name: spark-local-dir-1
            mountPath: /ossdata1
    cores: 4
    coreLimit: "4.3"
    memory: "6g"
    memoryOverhead: "2g"
    # 8 executors per node
    instances: 47
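    # 47 instances assumes a 6-node fleet (6 x 8 executor slots) with one slot
    # left free for the driver pod.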
    volumeMounts:
      - name: spark-local-dir-1
        mountPath: /ossdata1
  volumes:
    - name: spark-local-dir-1
      hostPath:
        path: /local1
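        # /local1 is assumed to be a local scratch disk (e.g. NVMe instance
        # store) on the worker nodes; adjust to your node's disk layout.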
  restartPolicy:
    type: Never
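
# To run the benchmark (assuming the Spark Operator, the "oss" namespace,
# the "oss" service account, and the ConfigMap above are already in place):
#   kubectl apply -f tpcds-benchmark.yaml
#   kubectl -n oss get sparkapplication tpcds-benchmark-oss -w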