Dockerfile, and Kafka/ Zookeeper config files

mjaglan · Dec 15, 2017 · c782703 · c782703
1 parent 25872c7
commit c782703
Show file tree

Hide file tree

Showing 5 changed files with 202 additions and 0 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,64 @@
+# author [email protected]
+# Coding Style: Shell form
+
+# Base image: Ubuntu 14.04. The JAVA_HOME set further down names a Java 7 OpenJDK
+# directory, so re-check that path whenever this tag is changed.
+FROM ubuntu:14.04
+
+# Run every following build step, and the container itself, as root.
+# NOTE(review): no later USER instruction drops privileges - confirm this is intended.
+USER root
+
+# Refresh the package index, upgrade the base system, then install the tools this
+# image needs: an ssh server (started by the entrypoint), a JDK and wget (used to
+# fetch the release archives further down).
+# The apt package lists are deleted in the same layer that downloaded them so they
+# do not stay in the image; run `apt-get update` again before installing anything
+# inside a running container.
+RUN apt-get update \
+    && apt-get -y dist-upgrade \
+    && apt-get install -y \
+        default-jdk \
+        openssh-server \
+        wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# Location of the JDK installed above.
+# NOTE(review): this is the Java 7 OpenJDK path - presumably what default-jdk
+# resolves to on ubuntu:14.04; verify if the base image or JDK package changes.
+ENV JAVA_HOME=/usr/lib/jvm/java-7-openjdk-amd64
+
+# Create a passphrase-less RSA key pair for the build user (root) and append its
+# public half to that user's authorized_keys, so logins with this key need no prompt.
+RUN ssh-keygen -t rsa -P "" -f "$HOME/.ssh/id_rsa" \
+    && cat "$HOME/.ssh/id_rsa.pub" >> "$HOME/.ssh/authorized_keys"
+
+# Download the Kafka release archive, unpack it, move the unpacked directory
+# (expected to be kafka_2.11-0.11.0.1) to /usr/local/kafka and delete the archive,
+# all in one layer so the archive never persists in the image.
+# Every path is absolute and tar is told where to extract (-C /), so the step does
+# not rely on the build's working directory happening to be "/".
+# NOTE(review): the download is not checksum-verified - consider adding a sha256 check.
+RUN wget -q -O /kafka.tar.gz https://iu.box.com/shared/static/jj9y2p5buaa875f2xejaq6zj94iqp6tn.tgz \
+	&& tar -xzf /kafka.tar.gz -C / \
+	&& mv /kafka_2.11-0.11.0.1 /usr/local/kafka \
+	&& rm /kafka.tar.gz
+
+# Kafka installation root; must match the mv target above.
+ENV KAFKA_HOME=/usr/local/kafka
+
+# Download the ZooKeeper release archive, unpack it, move the unpacked directory
+# (expected to be zookeeper-3.4.10) to /usr/local/zookeeper and delete the archive,
+# all in one layer so the archive never persists in the image.
+# Every path is absolute and tar is told where to extract (-C /), so the step does
+# not rely on the build's working directory happening to be "/".
+# NOTE(review): the download is not checksum-verified - consider adding a sha256 check.
+RUN wget -q -O /zookeeper.tar.gz https://iu.box.com/shared/static/36magujkse2nc33r865vqitnvymwl0wx.gz \
+	&& tar -xzf /zookeeper.tar.gz -C / \
+	&& mv /zookeeper-3.4.10 /usr/local/zookeeper \
+	&& rm /zookeeper.tar.gz
+
+# ZooKeeper installation root; must match the mv target above.
+ENV ZK_HOME=/usr/local/zookeeper
+
+# Install the configuration files (standalone, pseudo-distributed or fully
+# distributed mode, depending on what the files contain).
+# The config/ directory is first staged in /tmp (the FHS location for temporary
+# files: http://refspecs.linuxfoundation.org/FHS_3.0/fhs/ch03s18.html) and each
+# file is then moved to its final, absolute destination inside the image:
+#   - ssh_config           -> the build user's ~/.ssh/config
+#   - server.properties    -> Kafka's config directory
+#   - zookeeper.properties -> ZooKeeper's conf/zoo.cfg AND Kafka's config directory
+#   - myid                 -> /tmp/zookeeper (the dataDir named in the ZooKeeper template)
+# Any staged *.template files are removed at the end.
+# NOTE(review): server.properties and zookeeper.properties must exist in config/
+# when the image is built - presumably rendered from the *.template files
+# beforehand; verify against the build tooling.
+COPY config/ /tmp/
+RUN mv /tmp/ssh_config "${HOME}/.ssh/config" \
+    && mv /tmp/server.properties "${KAFKA_HOME}/config/server.properties" \
+    && cp /tmp/zookeeper.properties "${ZK_HOME}/conf/zoo.cfg" \
+    && mv /tmp/zookeeper.properties "${KAFKA_HOME}/config/zookeeper.properties" \
+    && mkdir -p /tmp/zookeeper/ \
+    && mv /tmp/myid /tmp/zookeeper/myid \
+    && rm -rf /tmp/*.template
+
+# Install the helper scripts: stage the scripts/ directory in /tmp, then move each
+# script named in the loop into $KAFKA_HOME under the same file name.
+# A failed move aborts the loop with a non-zero status, which fails the build.
+COPY scripts/ /tmp/
+RUN for script in kafka-services.sh zk-services.sh kafka-health.sh zk-health.sh kafka-benchmarks.sh; do \
+		mv "/tmp/${script}" "${KAFKA_HOME}/${script}" || exit 1; \
+	done
+
+# Set permissions: owner rwx, group/others read-only on everything under
+# $KAFKA_HOME and /tmp (this is what makes the installed scripts executable by root).
+# A recursive chmod also hits /tmp itself, stripping the sticky bit and the
+# world-write/search access that temporary-file users rely on, so /tmp is put
+# back to the conventional 1777 afterwards. Both steps share a single layer.
+RUN chmod -R 744 "$KAFKA_HOME" /tmp \
+	&& chmod 1777 /tmp
+
+# Container start-up: start the ssh service, then open a bash shell.
+# Shell form is used, so Docker runs this line through `/bin/sh -c`; because the
+# two commands are joined with `;`, bash is started even if `service ssh start` fails.
+# NOTE(review): bash is the long-running process here - presumably the container
+# is meant to be run with an interactive TTY (-it); confirm with the run scripts.
+ENTRYPOINT service ssh start; bash
+
diff --git a/config/myid b/config/myid
@@ -0,0 +1 @@
+1-255
diff --git a/config/server.properties.template b/config/server.properties.template
@@ -0,0 +1,124 @@
+# see kafka.server.KafkaConfig for additional details and defaults
+
+############################# Server Basics #############################
+
+# The id of the broker. This must be set to a unique integer for each broker.
+broker.id=0
+
+# Switch to enable topic deletion or not, default value is false
+delete.topic.enable=true
+
+############################# Socket Server Settings #############################
+
+# The address the socket server listens on. It will get the value returned from 
+# java.net.InetAddress.getCanonicalHostName() if not configured.
+#   FORMAT:
+#     listeners = listener_name://host_name:port
+#   EXAMPLE:
+#     listeners = PLAINTEXT://your.host.name:9092
+#listeners=PLAINTEXT://:9092
+
+# Hostname and port the broker will advertise to producers and consumers. If not set, 
+# it uses the value for "listeners" if configured.  Otherwise, it will use the value
+# returned from java.net.InetAddress.getCanonicalHostName().
+#advertised.listeners=PLAINTEXT://your.host.name:9092
+
+# Maps listener names to security protocols, the default is for them to be the same. See the config documentation for more details
+#listener.security.protocol.map=PLAINTEXT:PLAINTEXT,SSL:SSL,SASL_PLAINTEXT:SASL_PLAINTEXT,SASL_SSL:SASL_SSL
+
+# The number of threads that the server uses for receiving requests from the network and sending responses to the network
+num.network.threads=3
+
+# The number of threads that the server uses for processing requests, which may include disk I/O
+num.io.threads=8
+
+# The send buffer (SO_SNDBUF) used by the socket server
+socket.send.buffer.bytes=102400
+
+# The receive buffer (SO_RCVBUF) used by the socket server
+socket.receive.buffer.bytes=102400
+
+# The maximum size of a request that the socket server will accept (protection against OOM)
+socket.request.max.bytes=104857600
+
+
+############################# Log Basics #############################
+
+# A comma separated list of directories under which to store log files
+log.dirs=/tmp/kafka-logs
+
+# The default number of log partitions per topic. More partitions allow greater
+# parallelism for consumption, but this will also result in more files across
+# the brokers.
+num.partitions=1
+
+# The number of threads per data directory to be used for log recovery at startup and flushing at shutdown.
+# This value is recommended to be increased for installations with data dirs located in RAID array.
+num.recovery.threads.per.data.dir=1
+
+############################# Internal Topic Settings  #############################
+# The replication factor for the group metadata internal topics "__consumer_offsets" and "__transaction_state"
+# For anything other than development testing, a value greater than 1 (such as 3) is recommended to ensure availability.
+offsets.topic.replication.factor=1
+transaction.state.log.replication.factor=1
+transaction.state.log.min.isr=1
+
+############################# Log Flush Policy #############################
+
+# Messages are immediately written to the filesystem but by default we only fsync() to sync
+# the OS cache lazily. The following configurations control the flush of data to disk.
+# There are a few important trade-offs here:
+#    1. Durability: Unflushed data may be lost if you are not using replication.
+#    2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush.
+#    3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to excessive seeks.
+# The settings below allow one to configure the flush policy to flush data after a period of time or
+# every N messages (or both). This can be done globally and overridden on a per-topic basis.
+
+# The number of messages to accept before forcing a flush of data to disk
+#log.flush.interval.messages=10000
+
+# The maximum amount of time a message can sit in a log before we force a flush
+#log.flush.interval.ms=1000
+
+############################# Log Retention Policy #############################
+
+# The following configurations control the disposal of log segments. The policy can
+# be set to delete segments after a period of time, or after a given size has accumulated.
+# A segment will be deleted whenever *either* of these criteria are met. Deletion always happens
+# from the end of the log.
+
+# The minimum age of a log file to be eligible for deletion due to age
+log.retention.hours=168
+
+# A size-based retention policy for logs. Segments are pruned from the log as long as the remaining
+# segments don't drop below log.retention.bytes. Functions independently of log.retention.hours.
+#log.retention.bytes=1073741824
+
+# The maximum size of a log segment file. When this size is reached a new log segment will be created.
+log.segment.bytes=1073741824
+
+# The interval at which log segments are checked to see if they can be deleted according
+# to the retention policies
+log.retention.check.interval.ms=300000
+
+############################# Zookeeper #############################
+
+# Zookeeper connection string (see zookeeper docs for details).
+# This is a comma-separated list of host:port pairs, each corresponding to a zk
+# server. e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002".
+# You can also append an optional chroot string to the urls to specify the
+# root directory for all kafka znodes.
+zookeeper.connect=localhost:2181
+
+# Timeout in ms for connecting to zookeeper
+zookeeper.connection.timeout.ms=6000
+
+
+############################# Group Coordinator Settings #############################
+
+# The following configuration specifies the time, in milliseconds, that the GroupCoordinator will delay the initial consumer rebalance.
+# The rebalance will be further delayed by the value of group.initial.rebalance.delay.ms as new members join the group, up to a maximum of max.poll.interval.ms.
+# The default value for this is 3 seconds.
+# We override this to 0 here as it makes for a better out-of-the-box experience for development and testing.
+# However, in production environments the default value of 3 seconds is more suitable as this will help to avoid unnecessary, and potentially expensive, rebalances during application startup.
+group.initial.rebalance.delay.ms=0
diff --git a/config/ssh_config b/config/ssh_config
@@ -0,0 +1,3 @@
+Host *
+  StrictHostKeyChecking no
+  UserKnownHostsFile /dev/null
diff --git a/config/zookeeper.properties.template b/config/zookeeper.properties.template
@@ -0,0 +1,10 @@
+# the directory where the snapshot is stored.
+dataDir=/tmp/zookeeper
+# the port at which the clients will connect
+clientPort=2181
+# uncomment the next line to disable the per-ip limit on the number of connections (only sensible for a non-production config)
+# maxClientCnxns=0
+initLimit=5
+syncLimit=2
+tickTime=2000
+# add zookeeper servers below