# tpcds-data-generation.yaml
# // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# // SPDX-License-Identifier: MIT-0
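#
# SparkApplication for the Kubernetes Operator for Apache Spark. It runs the
# TPC-DS data generator from the eks-spark-benchmark image and writes a 3 TB
# partitioned Parquet dataset to the S3 bucket named by BUCKET_PARAM, which
# is read from the special-config ConfigMap referenced below.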
apiVersion: "sparkoperator.k8s.io/v1beta2"
kind: SparkApplication
metadata:
  name: tpcds-data-generation-3t
  namespace: oss
spec:
  type: Scala
  mode: cluster
  image: ghcr.io/aws-samples/eks-spark-benchmark:3.1.2
  imagePullPolicy: IfNotPresent
  sparkVersion: 3.1.2
  mainClass: com.amazonaws.eks.tpcds.DataGeneration
  mainApplicationFile: local:///opt/spark/examples/jars/eks-spark-benchmark-assembly-1.0.jar
  arguments:
    # TPC-DS data location
    - "s3://$(BUCKET_PARAM)/BLOG_TPCDS-TEST-3T-partitioned"
    # Path to the TPC-DS kit inside the Docker image
    - "/opt/tpcds-kit/tools"
    # Data format
    - "parquet"
    # Scale factor (in GB)
    - "3000"
    # Number of partitions to generate
    - "200"
    # Create the partitioned fact tables
    - "true"
    # Shuffle to coalesce partitions into single files
    - "true"
    # Set logging to WARN
    - "true"
  sparkConf:
    "spark.network.timeout": "2000s"
    "spark.executor.heartbeatInterval": "300s"
    "spark.kubernetes.memoryOverheadFactor": "0.3"
    "spark.sql.files.maxRecordsPerFile": "30000000"
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer"
    # "spark.kubernetes.node.selector.eks.amazonaws.com/capacityType": "ON_DEMAND"
    # "spark.kubernetes.node.selector.topology.kubernetes.io/zone": "us-west-2b"
    # S3 settings
    "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.WebIdentityTokenCredentialsProvider"
    "spark.hadoop.fs.s3a.fast.upload": "true"
    "spark.hadoop.fs.s3a.path.style.access": "true"
    "spark.hadoop.fs.s3.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem"
    "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version": "2"
    "spark.kubernetes.executor.podNamePrefix": "oss-data-gen"
    "spark.executor.defaultJavaOptions": "-verbose:gc -XX:+UseG1GC"
    "spark.driver.defaultJavaOptions": "-XX:+UseG1GC"
  driver:
    initContainers:
      - name: volume-permission
        image: public.ecr.aws/docker/library/busybox
        # Pre-create the scratch directory and make it writable by the
        # Spark pods (UID/GID 1000)
        command: ['sh', '-c', 'mkdir /ossdata1; chown -R 1000:1000 /ossdata1']
        volumeMounts:
          - name: spark-local-dir-1
            mountPath: /ossdata1
    env:
      - name: BUCKET_PARAM
        valueFrom:
          configMapKeyRef:
            name: special-config
            key: codeBucket
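    # BUCKET_PARAM comes from the special-config ConfigMap; Kubernetes
    # $(VAR) substitution expands it into the S3 output path in
    # spec.arguments.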
    cores: 10
    coreLimit: "10.1"
    memory: "10g"
    serviceAccount: oss
    volumeMounts:
      - name: "spark-local-dir-1"
        mountPath: "/ossdata1"
  executor:
    initContainers:
      - name: volume-permission
        image: public.ecr.aws/docker/library/busybox
        command: ['sh', '-c', 'mkdir /ossdata1; chown -R 1000:1000 /ossdata1']
        volumeMounts:
          - name: spark-local-dir-1
            mountPath: /ossdata1
    cores: 11
    coreLimit: "11.1"
    memory: "15g"
    # 3 pods per node across 9 nodes: 26 executors plus the driver
    instances: 26
    volumeMounts:
      - name: spark-local-dir-1
        mountPath: /ossdata1
  volumes:
    - name: spark-local-dir-1
      hostPath:
        path: /local1
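  # /local1 is assumed to be a fast local disk on the node (e.g. an NVMe
  # instance store volume) used as Spark scratch space; adjust the path to
  # match your nodes.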
  restartPolicy:
    type: Never
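# A sketch of how to run this manifest, assuming the Spark operator is
# installed and the oss namespace, oss service account, and special-config
# ConfigMap already exist:
#   kubectl apply -f tpcds-data-generation.yaml
#   kubectl get sparkapplication tpcds-data-generation-3t -n oss
#   kubectl logs -f tpcds-data-generation-3t-driver -n oss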