[Big Data Analytics] Hadoop Virtualized Deployment
Image build
Image addresses
Hadoop HA (high availability) image: registry.cn-shenzhen.aliyuncs.com/edata/hadoop:v1.5
Hadoop standalone image: registry.cn-shenzhen.aliyuncs.com/edata/hadoop_standalone:v1
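Both images can be pulled straight from the registry:

docker pull registry.cn-shenzhen.aliyuncs.com/edata/hadoop:v1.5
docker pull registry.cn-shenzhen.aliyuncs.com/edata/hadoop_standalone:v1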
Dockerfile
FROM ubuntu:20.04
#RUN sed -i 's@http://archive.ubuntu.com/ubuntu/@http://mirrors.aliyun.com/ubuntu/@g' /etc/apt/sources.list
RUN mkdir -p /root/directory
WORKDIR /root/directory
COPY . /root/directory
RUN tar -xvf hadoop-3.3.4.tar.gz \
    && tar -xvf jdk-8u221-linux-x64.tar.gz \
    && rm hadoop-3.3.4.tar.gz \
    && rm jdk-8u221-linux-x64.tar.gz \
    && chmod -R 777 hadoop-3.3.4 \
    && chmod -R 777 jdk1.8.0_221 \
    && chmod -R 777 entrypoint.sh \
    && mv hadoop-3.3.4 hadoop \
    && mv jdk1.8.0_221 jdk \
    && mkdir -p /root/directory/hadoop/hdfs/data \
    && mkdir -p /root/directory/hadoop/hdfs/name \
    && mkdir -p /root/directory/hadoop/hdfs/journal \
    && mkdir -p /root/directory/hadoop/tmp \
    && apt-get update \
    && apt-get install -y netcat psmisc \
    && apt-get purge -y --autoremove \
    && apt-get clean
ENV HADOOP_HOME=/root/directory/hadoop
ENV JAVA_HOME=/root/directory/jdk
ENV CLASSPATH=${JAVA_HOME}/lib:${JAVA_HOME}/jre/lib:$CLASSPATH
ENV PATH=${JAVA_HOME}/bin:${JAVA_HOME}/jre/bin:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin:$PATH
VOLUME [ "/root/directory/hadoop/hdfs/data","/root/directory/hadoop/hdfs/name","/root/directory/hadoop/hdfs/journal","/root/directory/hadoop/tmp" ]
ENTRYPOINT [ "./entrypoint.sh" ]
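To rebuild the image yourself, the build context must contain hadoop-3.3.4.tar.gz, jdk-8u221-linux-x64.tar.gz, and entrypoint.sh next to the Dockerfile (the COPY and tar steps above assume them). A minimal build command, with a hypothetical local tag:

docker build -t hadoop:v1.5 .
# or, with the standalone variant's entrypoint.sh in the context:
# docker build -t hadoop_standalone:v1 .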
entrypoint.sh
The Dockerfile copies in an entrypoint.sh script that starts the relevant processes. It comes in two versions, one for Hadoop HA mode and one for standalone mode.
HA version
#!/bin/bash
# Insert a <property> entry just above the closing </configuration> tag.
function addProperty() {
    local path=$1
    local name=$2
    local value=$3
    local entry="<property><name>$name</name><value>${value}</value></property>"
    local escapedEntry=$(echo $entry | sed 's/\//\\\//g')
    sed -i "/<\/configuration>/ s/.*/${escapedEntry}\n&/" $path
}
# Turn environment variables with a given prefix (e.g. HDFS_dfs_replication)
# into properties of the matching *-site.xml file.
function setConfig() {
    local conf_file=$1
    local module=$2
    local env_prefix=$3
    for c in `printenv | perl -sne 'print "$1 " if m/^${env_prefix}_(.+?)=.*/' -- -env_prefix=$env_prefix`; do
        name=`echo ${c} | perl -pe 's/___/-/g; s/__/@/g; s/_/./g; s/@/_/g;'`
        var="${env_prefix}_${c}"
        value=${!var}
        echo " - Setting $name=$value"
        addProperty /root/directory/hadoop/etc/hadoop/$module-site.xml $name $value
    done
}
# Block until host:port accepts TCP connections, retrying up to max_try times.
function wait_for_it() {
    local ipport=$1
    local ip=${ipport%%:*}
    local port=${ipport#*:}
    local retry_seconds=5
    local max_try=5
    let i=1
    nc -z $ip $port
    result=$?
    until [ $result -eq 0 ]; do
        echo "[$i/$max_try] ${ip}:${port} is not available yet"
        if (( $i == $max_try )); then
            echo "[$i/$max_try] ${ip}:${port} is still not available; giving up..."
            exit 1
        fi
        echo "[$i/$max_try] trying again in ${retry_seconds}s..."
        let "i++"
        sleep $retry_seconds
        nc -z $ip $port
        result=$?
    done
    echo "[$i/$max_try] ${ip}:${port} is available."
}
setConfig /root/directory/hadoop/etc/hadoop/core-site.xml core CORE
setConfig /root/directory/hadoop/etc/hadoop/hdfs-site.xml hdfs HDFS
setConfig /root/directory/hadoop/etc/hadoop/yarn-site.xml yarn YARN
setConfig /root/directory/hadoop/etc/hadoop/mapred-site.xml mapred MAPRED
sed -i "s|localhost|$WORKERS|g" /root/directory/hadoop/etc/hadoop/workers
for i in ${SERVICE_PRECONDITION[@]}; do
    wait_for_it ${i}
done
# Dispatch on the ROLE environment variable. The started process (or the
# trailing tail) keeps the container in the foreground.
if [ "$ROLE" == "namenode" ]; then
    $HADOOP_HOME/bin/hdfs haadmin -getAllServiceState >> namenodestate.txt
    if [ "`ls -A $HDFS_dfs_namenode_name_dir`" == "" ]; then
        if [ "`cat namenodestate.txt | grep standby`" != "" ]; then
            # Another namenode is already up: copy its metadata instead of formatting.
            echo "Syncing data of namenode..."
            $HADOOP_HOME/bin/hdfs namenode -bootstrapStandby
        else
            echo "Formatting namenode directory: $HDFS_dfs_namenode_name_dir"
            $HADOOP_HOME/bin/hdfs namenode -format
            echo "Formatting zookeeper with zkfc..."
            $HADOOP_HOME/bin/hdfs zkfc -formatZK
        fi
    fi
    echo "Starting namenode..."
    $HADOOP_HOME/bin/hdfs --daemon start namenode
    echo "Starting zkfc..."
    $HADOOP_HOME/bin/hdfs zkfc
fi
if [ "$ROLE" == "datanode" ]; then
    echo "Starting datanode..."
    $HADOOP_HOME/bin/hdfs datanode
fi
if [ "$ROLE" == "resourcemanager" ]; then
    echo "Starting resourcemanager..."
    $HADOOP_HOME/bin/yarn resourcemanager
fi
if [ "$ROLE" == "nodemanager" ]; then
    echo "Starting nodemanager..."
    $HADOOP_HOME/bin/yarn nodemanager
fi
if [ "$ROLE" == "journalnode" ]; then
    echo "Starting journalnode..."
    $HADOOP_HOME/bin/hdfs journalnode
fi
tail -f /dev/null
Standalone version
#!/bin/bash
# Insert a <property> entry just above the closing </configuration> tag.
function addProperty() {
    local path=$1
    local name=$2
    local value=$3
    local entry="<property><name>$name</name><value>${value}</value></property>"
    local escapedEntry=$(echo $entry | sed 's/\//\\\//g')
    sed -i "/<\/configuration>/ s/.*/${escapedEntry}\n&/" $path
}
# Turn environment variables with a given prefix into properties of the
# matching *-site.xml file.
function setConfig() {
    local conf_file=$1
    local module=$2
    local env_prefix=$3
    for c in `printenv | perl -sne 'print "$1 " if m/^${env_prefix}_(.+?)=.*/' -- -env_prefix=$env_prefix`; do
        name=`echo ${c} | perl -pe 's/___/-/g; s/__/@/g; s/_/./g; s/@/_/g;'`
        var="${env_prefix}_${c}"
        value=${!var}
        echo " - Setting $name=$value"
        addProperty /root/directory/hadoop/etc/hadoop/$module-site.xml $name $value
    done
}
# Block until host:port accepts TCP connections, retrying up to max_try times.
function wait_for_it() {
    local ipport=$1
    local ip=${ipport%%:*}
    local port=${ipport#*:}
    local retry_seconds=5
    local max_try=5
    let i=1
    nc -z $ip $port
    result=$?
    until [ $result -eq 0 ]; do
        echo "[$i/$max_try] ${ip}:${port} is not available yet"
        if (( $i == $max_try )); then
            echo "[$i/$max_try] ${ip}:${port} is still not available; giving up..."
            exit 1
        fi
        echo "[$i/$max_try] trying again in ${retry_seconds}s..."
        let "i++"
        sleep $retry_seconds
        nc -z $ip $port
        result=$?
    done
    echo "[$i/$max_try] ${ip}:${port} is available."
}
setConfig /root/directory/hadoop/etc/hadoop/core-site.xml core CORE
setConfig /root/directory/hadoop/etc/hadoop/hdfs-site.xml hdfs HDFS
setConfig /root/directory/hadoop/etc/hadoop/yarn-site.xml yarn YARN
setConfig /root/directory/hadoop/etc/hadoop/mapred-site.xml mapred MAPRED
sed -i "s|localhost|$WORKERS|g" /root/directory/hadoop/etc/hadoop/workers
for i in ${SERVICE_PRECONDITION[@]}; do
    wait_for_it ${i}
done
# Dispatch on the ROLE environment variable; the started process (or the
# trailing tail) keeps the container in the foreground.
if [ "$ROLE" == "namenode" ]; then
    if [ "`ls -A $HDFS_dfs_namenode_name_dir`" == "" ]; then
        echo "Formatting namenode directory: $HDFS_dfs_namenode_name_dir"
        $HADOOP_HOME/bin/hdfs namenode -format
    fi
    echo "Starting namenode..."
    $HADOOP_HOME/bin/hdfs --daemon start namenode
fi
if [ "$ROLE" == "secondarynamenode" ]; then
    echo "Starting secondarynamenode..."
    $HADOOP_HOME/bin/hdfs secondarynamenode
fi
if [ "$ROLE" == "datanode" ]; then
    echo "Starting datanode..."
    $HADOOP_HOME/bin/hdfs datanode
fi
if [ "$ROLE" == "resourcemanager" ]; then
    echo "Starting resourcemanager..."
    $HADOOP_HOME/bin/yarn resourcemanager
fi
if [ "$ROLE" == "nodemanager" ]; then
    echo "Starting nodemanager..."
    $HADOOP_HOME/bin/yarn nodemanager
fi
if [ "$ROLE" == "journalnode" ]; then
    echo "Starting journalnode..."
    $HADOOP_HOME/bin/hdfs journalnode
fi
tail -f /dev/null
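Both script versions are driven by the same two control variables: ROLE selects which daemon to start, and SERVICE_PRECONDITION is read by wait_for_it as a space-separated list of host:port endpoints that must be reachable before startup. A hypothetical setting that makes a datanode wait on two endpoints:

export ROLE=datanode
# space-separated host:port pairs; each is probed with nc -z before the daemon starts
export SERVICE_PRECONDITION="bigdata01:8082 bigdata01:8485"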
Starting the Hadoop containers
Standalone mode
In standalone mode, only the namenode, secondarynamenode, datanode, resourcemanager, and nodemanager containers need to be started.
Compose startup script
version: "3.1"
services:
  namenode:
    image: registry.cn-shenzhen.aliyuncs.com/edata/hadoop_standalone:v1
    container_name: namenode
    restart: always
    environment:
      - ROLE=namenode
    volumes:
      - /home/smart/edata_docker/hdfs_name/:/root/directory/hadoop/hdfs/name
      - /home/smart/edata_docker/hdfs_tmp/:/root/directory/hadoop/tmp
    env_file:
      - ./hadoop.env
    network_mode: "host"
  secondarynamenode:
    image: registry.cn-shenzhen.aliyuncs.com/edata/hadoop_standalone:v1
    container_name: secondarynamenode
    restart: always
    environment:
      - ROLE=secondarynamenode
      - SERVICE_PRECONDITION=bigdata01:8082
    env_file:
      - ./hadoop.env
    network_mode: "host"
  datanode:
    image: registry.cn-shenzhen.aliyuncs.com/edata/hadoop_standalone:v1
    container_name: datanode
    restart: always
    environment:
      - ROLE=datanode
      - SERVICE_PRECONDITION=bigdata01:8082
    volumes:
      - /home/smart/edata_docker/hdfs_data/:/root/directory/hadoop/hdfs/data
      - /home/smart/edata_docker/hdfs_tmp/:/root/directory/hadoop/tmp
    env_file:
      - ./hadoop.env
    network_mode: "host"
  resourcemanager:
    image: registry.cn-shenzhen.aliyuncs.com/edata/hadoop_standalone:v1
    container_name: resourcemanager
    restart: always
    environment:
      - ROLE=resourcemanager
      - SERVICE_PRECONDITION=bigdata01:8082
    env_file:
      - ./hadoop.env
    network_mode: "host"
  nodemanager:
    image: registry.cn-shenzhen.aliyuncs.com/edata/hadoop_standalone:v1
    container_name: nodemanager
    restart: always
    environment:
      - ROLE=nodemanager
      - SERVICE_PRECONDITION=bigdata01:8082
    env_file:
      - ./hadoop.env
    network_mode: "host"
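With the compose file saved next to hadoop.env (the file names here are an assumption), the whole standalone stack comes up in one command:

docker-compose up -d
docker-compose logs -f namenode    # watch the formatting and startup progress
# the NameNode web UI should then answer at http://bigdata01:9870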
hadoop.env configuration
CORE_fs_defaultFS=hdfs://bigdata01:8082
CORE_hadoop_tmp_dir=/root/directory/hadoop/tmp
HDFS_dfs_namenode_http___address=bigdata01:9870
HDFS_dfs_namenode_secondary_http___address=bigdata01:9868
HDFS_dfs_namenode_name_dir=/root/directory/hadoop/hdfs/name
HDFS_dfs_datanode_data_dir=/root/directory/hadoop/hdfs/data
HDFS_dfs_permissions_enabled=false
YARN_yarn_nodemanager_aux___services=mapreduce_shuffle
YARN_yarn_resourcemanager_scheduler_address=bigdata01:8030
YARN_yarn_resourcemanager_resource___tracker_address=bigdata01:8031
YARN_yarn_resourcemanager_address=bigdata01:8032
YARN_yarn_resourcemanager_admin_address=bigdata01:8033
YARN_yarn_resourcemanager_webapp_address=bigdata01:8088
MAPRED_mapreduce_framework_name=yarn
MAPRED_mapreduce_jobhistory_address=bigdata01:10020
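These variable names follow the mangling rule implemented in setConfig: the prefix (CORE/HDFS/YARN/MAPRED) selects the target *-site.xml, then in the remainder ___ becomes -, __ becomes _, and a single _ becomes a dot. The transformation can be checked with the same perl one-liner the script uses:

echo "dfs_namenode_http___address" | perl -pe 's/___/-/g; s/__/@/g; s/_/./g; s/@/_/g;'
# prints dfs.namenode.http-address, so HDFS_dfs_namenode_http___address=bigdata01:9870
# ends up as the dfs.namenode.http-address property in hdfs-site.xml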
HA (high availability) mode
Hadoop HA deployment depends on ZooKeeper; deploying the ZooKeeper cluster itself is not covered here. In HA mode, the namenode (usually co-located with its ZKFC), datanode, journalnode, resourcemanager, and nodemanager roles are distributed across different nodes of the cluster.
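Before starting the Hadoop containers it is worth confirming that the ZooKeeper quorum is reachable, for example with the same nc tool the image installs (host names as used in hadoop_ha.env below):

for h in bigdata01 bigdata02 bigdata03; do nc -z $h 2181 && echo "$h:2181 up"; done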
Cluster plan
| | NameNode & ZKFC | DataNode | JournalNode | ResourceManager | NodeManager |
| --- | --- | --- | --- | --- | --- |
| 172.x.x.97 | * | * | * | * | * |
| 172.x.x.98 | * | * | * | * | * |
| 172.x.x.99 | | * | * | * | * |
Compose startup scripts
journalnode
version: "3.1"
services:
  journalnode:
    image: registry.cn-shenzhen.aliyuncs.com/edata/hadoop:v1.5
    container_name: journalnode
    restart: always
    environment:
      - ROLE=journalnode
    volumes:
      - /home/smart/edata_docker/hdfs_journal/:/root/directory/hadoop/hdfs/journal
    env_file:
      - ./hadoop_ha.env
    network_mode: "host"
namenode and zkfc
version: "3.1"
services:
  namenode:
    image: registry.cn-shenzhen.aliyuncs.com/edata/hadoop:v1.5
    container_name: namenode
    restart: always
    environment:
      - ROLE=namenode
    volumes:
      - /home/smart/edata_docker/hdfs_name/:/root/directory/hadoop/hdfs/name
      - /home/smart/edata_docker/hdfs_tmp/:/root/directory/hadoop/tmp
    env_file:
      - ./hadoop_ha.env
    network_mode: "host"
datanode
version: "3.1"
services:
  datanode:
    image: registry.cn-shenzhen.aliyuncs.com/edata/hadoop:v1.5
    container_name: datanode
    restart: always
    environment:
      - ROLE=datanode
    volumes:
      - /home/smart/edata_docker/hdfs_data/:/root/directory/hadoop/hdfs/data
      - /home/smart/edata_docker/hdfs_tmp/:/root/directory/hadoop/tmp
    env_file:
      - ./hadoop_ha.env
    network_mode: "host"
resourcemanager
version: "3.1"
services:
  resourcemanager:
    image: registry.cn-shenzhen.aliyuncs.com/edata/hadoop:v1.5
    container_name: resourcemanager
    restart: always
    environment:
      - ROLE=resourcemanager
    env_file:
      - ./hadoop_ha.env
    network_mode: "host"
nodemanager
version: "3.1"
services:
  nodemanager:
    image: registry.cn-shenzhen.aliyuncs.com/edata/hadoop:v1.5
    container_name: nodemanager
    restart: always
    environment:
      - ROLE=nodemanager
    env_file:
      - ./hadoop_ha.env
    network_mode: "host"
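Because each role ships as its own compose file, first-boot order matters: the JournalNodes must be up before a NameNode can format or bootstrap, and the second NameNode bootstraps from the first. A plausible bring-up sequence (the .yml file names are assumptions):

# on all three nodes
docker-compose -f journalnode.yml up -d
# on 172.x.x.97 first, then on 172.x.x.98 (it bootstraps as standby)
docker-compose -f namenode.yml up -d
# then on every node that runs them
docker-compose -f datanode.yml up -d
docker-compose -f resourcemanager.yml up -d
docker-compose -f nodemanager.yml up -d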
hadoop_ha.env configuration
CORE_fs_defaultFS=hdfs://edata
CORE_hadoop_tmp_dir=/root/directory/hadoop/tmp
CORE_ha_zookeeper_quorum=bigdata01:2181,bigdata02:2181,bigdata03:2181
HDFS_dfs_nameservices=edata
HDFS_dfs_ha_namenodes_edata=nn1,nn2
HDFS_dfs_namenode_rpc___address_edata_nn1=bigdata01:8082
HDFS_dfs_namenode_rpc___address_edata_nn2=bigdata02:8082
HDFS_dfs_namenode_http___address_edata_nn1=bigdata01:9870
HDFS_dfs_namenode_http___address_edata_nn2=bigdata02:9870
HDFS_dfs_namenode_name_dir=/root/directory/hadoop/hdfs/name
HDFS_dfs_namenode_shared_edits_dir=qjournal://bigdata01:8485;bigdata02:8485;bigdata03:8485/edata
HDFS_dfs_ha_fencing_methods=sshfence\nshell(/bin/true)
HDFS_dfs_ha_fencing_ssh_private___key___files=/root/.ssh/id_rsa
HDFS_dfs_ha_automatic___failover_enabled=true
HDFS_dfs_client_failover_proxy_provider_edata=org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider
HDFS_dfs_datanode_data_dir=/root/directory/hadoop/hdfs/data
HDFS_dfs_journalnode_edits_dir=/root/directory/hadoop/hdfs/journal
HDFS_dfs_replication=3
HDFS_dfs_permissions_enabled=false
YARN_yarn_resourcemanager_ha_enabled=true
YARN_yarn_resourcemanager_cluster___id=edatayarn
YARN_yarn_resourcemanager_ha_rm___ids=rm1,rm2,rm3
YARN_yarn_resourcemanager_hostname_rm1=bigdata01
YARN_yarn_resourcemanager_hostname_rm2=bigdata02
YARN_yarn_resourcemanager_hostname_rm3=bigdata03
YARN_yarn_resourcemanager_webapp_address_rm1=bigdata01:8088
YARN_yarn_resourcemanager_webapp_address_rm2=bigdata02:8088
YARN_yarn_resourcemanager_webapp_address_rm3=bigdata03:8088
YARN_yarn_resourcemanager_recovery_enabled=true
YARN_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore
YARN_yarn_nodemanager_aux___services=mapreduce_shuffle
YARN_yarn_nodemanager_pmem___check___enabled=false
YARN_yarn_nodemanager_vmem___check___enabled=false
YARN_hadoop_zk_address=bigdata01:2181,bigdata02:2181,bigdata03:2181
MAPRED_mapreduce_framework_name=yarn
MAPRED_mapreduce_jobhistory_address=bigdata01:10020
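Once everything is running, the HA state can be spot-checked from inside the containers (a sketch, assuming the container names above):

docker exec namenode hdfs haadmin -getAllServiceState      # expect one active and one standby
docker exec resourcemanager yarn rmadmin -getServiceState rm1
docker exec namenode hdfs dfsadmin -report                 # all DataNodes should be registered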