-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathDockerfile
77 lines (62 loc) · 2.59 KB
/
Dockerfile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# Base image for a single-node Hadoop 2.7 / Spark 3.2 teaching cluster.
# NOTE(review): ubuntu:18.04 is past end of standard support — consider a
# newer LTS (would require revalidating the Java 8 / Hadoop / Spark combo).
FROM ubuntu:18.04
# All cluster assets (SSH keys, configs, start script) are staged in
# root's home directory; the image runs as root throughout.
# NOTE(review): /root as WORKDIR and a root user are discouraged for
# production images; acceptable for a local sandbox cluster.
WORKDIR /root
# Install all system packages in ONE layer so the index from
# `apt-get update` can never be reused stale by a later layer, and the
# apt lists are removed in the same layer that created them.
#  - wget + ca-certificates: required by the HTTPS Hadoop/Spark downloads
#    below; the ubuntu base image ships neither (the original build
#    failed at the wget step).
#  - --no-install-recommends keeps the image small; everything actually
#    needed is listed explicitly.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        nano \
        openjdk-8-jdk \
        openssh-server \
        python3 \
        python3-pip \
        wget \
    && ln -s /usr/bin/python3 /usr/bin/python \
    && rm -rf /var/lib/apt/lists/*
# --no-cache-dir keeps pip's download cache out of the image layer.
# NOTE(review): pyspark/jupyter are unpinned — pin versions for
# reproducible builds.
RUN pip3 install --no-cache-dir --upgrade setuptools && \
    pip3 install --no-cache-dir pyspark jupyter
# Download and unpack Hadoop into /usr/local/hadoop.  The version is a
# build ARG (default preserves the original 2.7.7) so it can be bumped
# with `docker build --build-arg HADOOP_VERSION=...`.
# NOTE(review): the archive is fetched without checksum verification —
# consider validating against the .sha512 file published next to it.
ARG HADOOP_VERSION=2.7.7
RUN wget https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz && \
    tar -xzf hadoop-${HADOOP_VERSION}.tar.gz && \
    mv hadoop-${HADOOP_VERSION} /usr/local/hadoop && \
    rm hadoop-${HADOOP_VERSION}.tar.gz
# Download and unpack Spark (built against Hadoop 2.7) into
# /usr/local/spark.  Version is a build ARG (default preserves the
# original 3.2.4).
# NOTE(review): fetched without checksum verification — consider
# validating against the published .sha512.
ARG SPARK_VERSION=3.2.4
RUN wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop2.7.tgz && \
    tar -xzf spark-${SPARK_VERSION}-bin-hadoop2.7.tgz && \
    mv spark-${SPARK_VERSION}-bin-hadoop2.7 /usr/local/spark && \
    rm spark-${SPARK_VERSION}-bin-hadoop2.7.tgz
# Runtime environment for Hadoop and Spark.
# All entries use the key=value form (the space-separated `ENV key value`
# syntax is deprecated), the duplicate HADOOP_CONF_DIR definition was
# removed, and literal install paths are expressed via the HOME vars so
# a relocation only needs one change.
ENV HADOOP_HOME=/usr/local/hadoop
ENV HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
ENV SPARK_HOME=/usr/local/spark
ENV SPARK_MASTER_PORT=7077
ENV LD_LIBRARY_PATH=$HADOOP_HOME/lib/native
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
ENV PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$SPARK_HOME/bin
# Passwordless SSH between cluster processes: generate a key pair for
# root and authorize it against itself.
#  - mkdir -p guards against ~/.ssh not existing (ssh-keygen does not
#    create parent directories for an explicit -f path).
#  - 700 on ~/.ssh and 600 on authorized_keys are the permissions sshd
#    requires before it will honor the key.
RUN mkdir -p ~/.ssh && \
    chmod 700 ~/.ssh && \
    ssh-keygen -t rsa -f ~/.ssh/id_rsa -P '' && \
    cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \
    chmod 0600 ~/.ssh/authorized_keys
# Stage the Hadoop/Spark configuration files from the build context and
# move each one into place.
COPY /spark_configs/* /tmp/
# NOTE: the context file is named spark-default.conf, but Spark only
# reads $SPARK_HOME/conf/spark-defaults.conf (plural) — install it under
# the name Spark expects; the original kept the typo and the file was
# silently ignored.
RUN mv /tmp/ssh_config ~/.ssh/config && \
    mv /tmp/hadoop-env.sh $HADOOP_HOME/etc/hadoop/hadoop-env.sh && \
    mv /tmp/hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml && \
    mv /tmp/core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml && \
    mv /tmp/mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml && \
    mv /tmp/yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml && \
    mv /tmp/slaves $HADOOP_HOME/etc/hadoop/slaves && \
    mv /tmp/workers $SPARK_HOME/conf/workers && \
    mv /tmp/spark-env.sh $SPARK_HOME/conf/spark-env.sh && \
    mv /tmp/spark-default.conf $SPARK_HOME/conf/spark-defaults.conf && \
    mv /tmp/start-cluster.sh ~/start-cluster.sh
# Event-log directory for the Spark history server; world-writable so
# any job user can log here.
# NOTE(review): 777 is broad — acceptable in a sandbox, not production.
RUN mkdir /tmp/spark-events && \
    chmod 777 /tmp/spark-events && \
    chmod u+rx ~/start-cluster.sh
# Strip Windows CRLF line endings from every shell-sourced or
# line-parsed config in ONE layer (the original used six).
RUN sed -i 's/\r$//g' \
        $HADOOP_HOME/etc/hadoop/slaves \
        $HADOOP_HOME/etc/hadoop/hadoop-env.sh \
        $SPARK_HOME/conf/workers \
        $SPARK_HOME/conf/spark-defaults.conf \
        $SPARK_HOME/conf/spark-env.sh \
        ~/start-cluster.sh
# Format the HDFS namenode at build time so the image starts with a
# ready, empty filesystem and no manual format step is needed.
# NOTE(review): every container started from this image shares the same
# baked-in namenode clusterID/state — confirm this matches how the
# master/worker containers are launched (datanodes formatted against a
# different clusterID will refuse to join).
RUN $HADOOP_HOME/bin/hdfs namenode -format