
[Big Data Analytics] Hadoop Virtualized Deployment

sword_csdn 2024-07-09 12:01:02

Building the Images

Image addresses

Hadoop HA (high-availability) image: registry.cn-shenzhen.aliyuncs.com/edata/hadoop:v1.5
Hadoop standalone image: registry.cn-shenzhen.aliyuncs.com/edata/hadoop_standalone:v1
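
To try the prebuilt images directly, pull them from the registry (assuming your host can reach the Aliyun registry):

docker pull registry.cn-shenzhen.aliyuncs.com/edata/hadoop:v1.5
docker pull registry.cn-shenzhen.aliyuncs.com/edata/hadoop_standalone:v1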

Dockerfile

FROM ubuntu:20.04
#RUN sed -i 's@http://archive.ubuntu.com/ubuntu/@http://mirrors.aliyun.com/ubuntu/@g' /etc/apt/sources.list
RUN mkdir -p /root/directory
WORKDIR /root/directory
COPY . /root/directory
RUN tar -xvf hadoop-3.3.4.tar.gz \
  && tar -xvf jdk-8u221-linux-x64.tar.gz \
  && rm hadoop-3.3.4.tar.gz \
  && rm jdk-8u221-linux-x64.tar.gz \
  && chmod -R 777 hadoop-3.3.4 \
  && chmod -R 777 jdk1.8.0_221 \
  && chmod -R 777 entrypoint.sh \
  && mv hadoop-3.3.4 hadoop \
  && mv jdk1.8.0_221 jdk \
  && mkdir -p /root/directory/hadoop/hdfs/data \
  && mkdir -p /root/directory/hadoop/hdfs/name \
  && mkdir -p /root/directory/hadoop/hdfs/journal \
  && mkdir -p /root/directory/hadoop/tmp \
  && apt-get update \
  && apt-get install -y netcat psmisc \
  && apt-get autoremove --purge -y \
  && apt-get clean
ENV HADOOP_HOME=/root/directory/hadoop
ENV JAVA_HOME=/root/directory/jdk
ENV CLASSPATH=${JAVA_HOME}/lib:${JAVA_HOME}/jre/lib:$CLASSPATH
ENV PATH=${JAVA_HOME}/bin:${JAVA_HOME}/jre/bin:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin:$PATH
VOLUME [ "/root/directory/hadoop/hdfs/data","/root/directory/hadoop/hdfs/name","/root/directory/hadoop/hdfs/journal","/root/directory/hadoop/tmp" ]
ENTRYPOINT [ "./entrypoint.sh" ]
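
To build the image yourself, the build context must contain hadoop-3.3.4.tar.gz, jdk-8u221-linux-x64.tar.gz, and entrypoint.sh next to the Dockerfile; the tag below is just an example:

docker build -t hadoop:v1.5 .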

entrypoint.sh

The Dockerfile copies in an entrypoint.sh script that starts the relevant processes. It comes in two versions, corresponding to the two ways of starting Hadoop: HA and standalone.

HA version

#!/bin/bash
# Append a <property> entry just before </configuration> in the given XML file.
function addProperty() {
  local path=$1
  local name=$2
  local value=$3
  local entry="<property><name>$name</name><value>${value}</value></property>"
  local escapedEntry=$(echo $entry | sed 's/\//\\\//g')
  sed -i "/<\/configuration>/ s/.*/${escapedEntry}\n&/" $path
}
# Turn environment variables with the given prefix (CORE_, HDFS_, ...) into
# properties in the matching $module-site.xml.
function setConfig() {
    local conf_file=$1
    local module=$2
    local env_prefix=$3
    for c in `printenv | perl -sne 'print "$1 " if m/^${env_prefix}_(.+?)=.*/' -- -env_prefix=$env_prefix`;do
        name=`echo ${c} | perl -pe 's/___/-/g; s/__/@/g; s/_/./g; s/@/_/g;'`
        var="${env_prefix}_${c}"
        value=${!var}
        echo " - Setting $name=$value"
        addProperty /root/directory/hadoop/etc/hadoop/$module-site.xml $name $value
    done
}
# Poll host:port with nc until it is reachable, up to max_try attempts.
function wait_for_it() {
  local ipport=$1
  local ip=${ipport%%:*}
  local port=${ipport#*:}
  local retry_seconds=5
  local max_try=5
  let i=1
  nc -z $ip $port
  result=$?
  until [ $result -eq 0 ]; do
    echo "[$i/$max_try] check for ${ip}:${port}..."
    echo "[$i/$max_try] ${ip}:${port} is not available yet"
    if (($i == $max_try )); then
      echo "[$i/$max_try] ${ip}:${port} is still not available; giving up..."
      exit 1
    fi
    echo "[$i/$max_try] try in ${retry_seconds}s once again..."
    let "i++"
    sleep $retry_seconds
    nc -z $ip $port
    result=$?
  done
  echo "[$i/$max_try] $ip:$port is available."
}
setConfig /root/directory/hadoop/etc/hadoop/core-site.xml core CORE
setConfig /root/directory/hadoop/etc/hadoop/hdfs-site.xml hdfs HDFS
setConfig /root/directory/hadoop/etc/hadoop/yarn-site.xml yarn YARN
setConfig /root/directory/hadoop/etc/hadoop/mapred-site.xml mapred MAPRED
sed -i "s|localhost|$WORKERS|g" /root/directory/hadoop/etc/hadoop/workers

for i in ${SERVICE_PRECONDITION[@]}; do
  wait_for_it ${i}
done

if [ "$ROLE" == "namenode" ]; then
  $HADOOP_HOME/bin/hdfs haadmin -getAllServiceState >> namenodestate.txt
  if [ "`ls -A $HDFS_dfs_namenode_name_dir`" == "" ];then
    if [ "`cat namenodestate.txt | grep standby`" != "" ];then
      echo "Syncing data of namenode..."
      $HADOOP_HOME/bin/hdfs namenode -bootstrapStandby
    else
      echo "Formatting namenode directory:$HDFS_dfs_namenode_name_dir"
      $HADOOP_HOME/bin/hdfs namenode -format
      echo "Formatting zookeeper with zkfc..."
      $HADOOP_HOME/bin/hdfs zkfc -formatZK
    fi
  fi
  echo "Starting namenode..."
  $HADOOP_HOME/bin/hdfs --daemon start namenode
  echo "Starting zkfc..."
  $HADOOP_HOME/bin/hdfs zkfc
fi

if [ "$ROLE" == "datanode" ]; then
  echo "Starting datanode..."
  $HADOOP_HOME/bin/hdfs datanode
fi

if [ "$ROLE" == "resourcemanager" ]; then
  echo "Starting resourcemanager..."
  $HADOOP_HOME/bin/yarn resourcemanager
fi

if [ "$ROLE" == "nodemanager" ]; then
  echo "Starting nodemanager..."
  $HADOOP_HOME/bin/yarn nodemanager
fi

if [ "$ROLE" == "journalnode" ]; then
  echo "Starting journalnode..."
  $HADOOP_HOME/bin/hdfs journalnode
fi

# Keep the container in the foreground so Docker does not exit.
tail -f /dev/null
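
The underscore mangling in setConfig is what lets flat environment-variable names carry dotted Hadoop property names: a triple underscore becomes a dash, a double underscore a literal underscore (via the @ placeholder), and a single underscore a dot. A quick check of the substitution, run as a standalone snippet outside the container:

echo "dfs_namenode_http___address" | perl -pe 's/___/-/g; s/__/@/g; s/_/./g; s/@/_/g;'
# prints: dfs.namenode.http-address

So, for example, HDFS_dfs_namenode_http___address=bigdata01:9870 in the env file becomes the dfs.namenode.http-address property in hdfs-site.xml.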

Standalone version

#!/bin/bash
# Append a <property> entry just before </configuration> in the given XML file.
function addProperty() {
  local path=$1
  local name=$2
  local value=$3
  local entry="<property><name>$name</name><value>${value}</value></property>"
  local escapedEntry=$(echo $entry | sed 's/\//\\\//g')
  sed -i "/<\/configuration>/ s/.*/${escapedEntry}\n&/" $path
}


# Turn environment variables with the given prefix (CORE_, HDFS_, ...) into
# properties in the matching $module-site.xml.
function setConfig() {
    local conf_file=$1
    local module=$2
    local env_prefix=$3
    for c in `printenv | perl -sne 'print "$1 " if m/^${env_prefix}_(.+?)=.*/' -- -env_prefix=$env_prefix`;do
        name=`echo ${c} | perl -pe 's/___/-/g; s/__/@/g; s/_/./g; s/@/_/g;'`
        var="${env_prefix}_${c}"
        value=${!var}
        echo " - Setting $name=$value"
        addProperty /root/directory/hadoop/etc/hadoop/$module-site.xml $name $value
    done
}

# Poll host:port with nc until it is reachable, up to max_try attempts.
function wait_for_it() {
  local ipport=$1
  local ip=${ipport%%:*}
  local port=${ipport#*:}
  local retry_seconds=5
  local max_try=5
  let i=1
  nc -z $ip $port
  result=$?
  until [ $result -eq 0 ]; do
    echo "[$i/$max_try] check for ${ip}:${port}..."
    echo "[$i/$max_try] ${ip}:${port} is not available yet"
    if (($i == $max_try )); then
      echo "[$i/$max_try] ${ip}:${port} is still not available; giving up..."
      exit 1
    fi
    echo "[$i/$max_try] try in ${retry_seconds}s once again..."
    let "i++"
    sleep $retry_seconds
    nc -z $ip $port
    result=$?
  done
  echo "[$i/$max_try] $ip:$port is available."
}

setConfig /root/directory/hadoop/etc/hadoop/core-site.xml core CORE
setConfig /root/directory/hadoop/etc/hadoop/hdfs-site.xml hdfs HDFS
setConfig /root/directory/hadoop/etc/hadoop/yarn-site.xml yarn YARN
setConfig /root/directory/hadoop/etc/hadoop/mapred-site.xml mapred MAPRED
sed -i "s|localhost|$WORKERS|g" /root/directory/hadoop/etc/hadoop/workers

for i in ${SERVICE_PRECONDITION[@]}; do
  wait_for_it ${i}
done

if [ "$ROLE" == "namenode" ]; then
  if [ "`ls -A $HDFS_dfs_namenode_name_dir`" == "" ];then
    echo "Formatting namenode directory:$HDFS_dfs_namenode_name_dir"
    $HADOOP_HOME/bin/hdfs namenode -format
  fi
  echo "Starting namenode..."
  $HADOOP_HOME/bin/hdfs --daemon start namenode
fi

if [ "$ROLE" == "secondarynamenode" ]; then
  echo "Starting secondarynamenode..."
  $HADOOP_HOME/bin/hdfs secondarynamenode
fi

if [ "$ROLE" == "datanode" ]; then
  echo "Starting datanode..."
  $HADOOP_HOME/bin/hdfs datanode
fi

if [ "$ROLE" == "resourcemanager" ]; then
  echo "Starting resourcemanager..."
  $HADOOP_HOME/bin/yarn resourcemanager
fi

if [ "$ROLE" == "nodemanager" ]; then
  echo "Starting nodemanager..."
  $HADOOP_HOME/bin/yarn nodemanager
fi

if [ "$ROLE" == "journalnode" ]; then
  echo "Starting journalnode..."
  $HADOOP_HOME/bin/hdfs journalnode
fi

# Keep the container in the foreground so Docker does not exit.
tail -f /dev/null
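
Both versions read SERVICE_PRECONDITION as a space-separated list of host:port pairs and call wait_for_it on each one before starting any daemon. A hypothetical value that waits for both the NameNode RPC port and a Zookeeper node:

SERVICE_PRECONDITION="bigdata01:8082 bigdata01:2181"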

Starting the Hadoop Containers

Standalone mode

In standalone mode you only need to start the namenode, secondarynamenode, datanode, resourcemanager, and nodemanager containers.

Compose startup script

version: "3.1"
services:
  namenode:
    image: registry.cn-shenzhen.aliyuncs.com/edata/hadoop_standalone:v1
    container_name: namenode
    restart: always
    environment:
      - ROLE=namenode
    volumes:
      - /home/smart/edata_docker/hdfs_name/:/root/directory/hadoop/hdfs/name
      - /home/smart/edata_docker/hdfs_tmp/:/root/directory/hadoop/tmp
    env_file:
      - ./hadoop.env
    network_mode: "host"

  secondarynamenode:
    image: registry.cn-shenzhen.aliyuncs.com/edata/hadoop_standalone:v1
    container_name: secondarynamenode
    restart: always
    environment:
      - ROLE=secondarynamenode
      - SERVICE_PRECONDITION=bigdata01:8082
    env_file:
      - ./hadoop.env
    network_mode: "host"

  datanode:
    image: registry.cn-shenzhen.aliyuncs.com/edata/hadoop_standalone:v1
    container_name: datanode
    restart: always
    environment:
      - ROLE=datanode
      - SERVICE_PRECONDITION=bigdata01:8082
    volumes:
      - /home/smart/edata_docker/hdfs_data/:/root/directory/hadoop/hdfs/data
      - /home/smart/edata_docker/hdfs_tmp/:/root/directory/hadoop/tmp
    env_file:
      - ./hadoop.env
    network_mode: "host"

  resourcemanager:
    image: registry.cn-shenzhen.aliyuncs.com/edata/hadoop_standalone:v1
    container_name: resourcemanager
    restart: always
    environment:
      - ROLE=resourcemanager
      - SERVICE_PRECONDITION=bigdata01:8082
    env_file:
      - ./hadoop.env
    network_mode: "host"

  nodemanager:
    image: registry.cn-shenzhen.aliyuncs.com/edata/hadoop_standalone:v1
    container_name: nodemanager
    restart: always
    environment:
      - ROLE=nodemanager
      - SERVICE_PRECONDITION=bigdata01:8082
    env_file:
      - ./hadoop.env
    network_mode: "host"
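
With this compose file and hadoop.env (below) in the same directory, the whole standalone stack comes up in one step; the file name docker-compose.yml is assumed here:

docker compose -f docker-compose.yml up -d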

hadoop.env configuration

CORE_fs_defaultFS=hdfs://bigdata01:8082
CORE_hadoop_tmp_dir=/root/directory/hadoop/tmp
HDFS_dfs_namenode_http___address=bigdata01:9870
HDFS_dfs_namenode_secondary_http___address=bigdata01:9868
HDFS_dfs_namenode_name_dir=/root/directory/hadoop/hdfs/name
HDFS_dfs_datanode_data_dir=/root/directory/hadoop/hdfs/data
HDFS_dfs_permissions_enabled=false

YARN_yarn_nodemanager_aux___services=mapreduce_shuffle
YARN_yarn_resourcemanager_scheduler_address=bigdata01:8030
YARN_yarn_resourcemanager_resource___tracker_address=bigdata01:8031
YARN_yarn_resourcemanager_address=bigdata01:8032
YARN_yarn_resourcemanager_admin_address=bigdata01:8033
YARN_yarn_resourcemanager_webapp_address=bigdata01:8088

MAPRED_mapreduce_framework_name=yarn
MAPRED_mapreduce_jobhistory_address=bigdata01:10020
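
Once the containers are running, a quick health check from the host (the container name namenode matches the compose file above; dfsadmin and node are standard Hadoop CLIs):

docker exec namenode hdfs dfsadmin -report
docker exec namenode yarn node -list

The NameNode web UI should also be reachable at http://bigdata01:9870.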

HA (high-availability) mode

Hadoop's HA deployment depends on Zookeeper; deploying the Zookeeper cluster itself is not covered here. In HA mode the namenode (generally co-located with a ZKFC), datanode, journalnode, resourcemanager, and nodemanager roles are deployed across different nodes of the cluster.

Cluster plan

| Node       | NameNode & ZKFC | DataNode | JournalNode | ResourceManager | NodeManager |
|------------|-----------------|----------|-------------|-----------------|-------------|
| 172.x.x.97 | *               | *        | *           | *               | *           |
| 172.x.x.98 | *               | *        | *           | *               | *           |
| 172.x.x.99 |                 | *        | *           | *               | *           |

Compose startup scripts

journalnode

version: "3.1"
services:
  journalnode:
    image: registry.cn-shenzhen.aliyuncs.com/edata/hadoop:v1.5
    container_name: journalnode
    restart: always
    environment:
      - ROLE=journalnode
    volumes:
      - /home/smart/edata_docker/hdfs_journal/:/root/directory/hadoop/hdfs/journal
    env_file:
      - ./hadoop_ha.env
    network_mode: "host"

namenode and zkfc

version: "3.1"

services:
  namenode:
    image: registry.cn-shenzhen.aliyuncs.com/edata/hadoop:v1.5
    container_name: namenode
    restart: always
    environment:
      - ROLE=namenode
    volumes:
      - /home/smart/edata_docker/hdfs_name/:/root/directory/hadoop/hdfs/name
      - /home/smart/edata_docker/hdfs_tmp/:/root/directory/hadoop/tmp
    env_file:
      - ./hadoop_ha.env
    network_mode: "host"

datanode

version: "3.1"

services:
  datanode:
    image: registry.cn-shenzhen.aliyuncs.com/edata/hadoop:v1.5
    container_name: datanode
    restart: always
    environment:
      - ROLE=datanode
    volumes:
      - /home/smart/edata_docker/hdfs_data/:/root/directory/hadoop/hdfs/data
      - /home/smart/edata_docker/hdfs_tmp/:/root/directory/hadoop/tmp
    env_file:
      - ./hadoop_ha.env
    network_mode: "host"

resourcemanager

version: "3.1"

services:
  resourcemanager:
    image: registry.cn-shenzhen.aliyuncs.com/edata/hadoop:v1.5
    container_name: resourcemanager
    restart: always
    environment:
      - ROLE=resourcemanager
    env_file:
      - ./hadoop_ha.env
    network_mode: "host"

nodemanager

version: "3.1"

services:
  nodemanager:
    image: registry.cn-shenzhen.aliyuncs.com/edata/hadoop:v1.5
    container_name: nodemanager
    restart: always
    environment:
      - ROLE=nodemanager
    env_file:
      - ./hadoop_ha.env
    network_mode: "host"
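
In HA mode the startup order matters: Zookeeper and the journalnodes must be up before the first namenode can format and write shared edits, and the second namenode syncs its metadata from the first via -bootstrapStandby (handled in entrypoint.sh). A rollout sketch, assuming the compose files above are saved one per role (the file names are hypothetical):

# on bigdata01..03: start Zookeeper (not covered here), then the journalnodes
docker compose -f journalnode.yml up -d
# on bigdata01 first, then bigdata02 (the standby bootstraps from the active)
docker compose -f namenode.yml up -d
# on all three nodes
docker compose -f datanode.yml up -d
docker compose -f resourcemanager.yml up -d
docker compose -f nodemanager.yml up -d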

hadoop_ha.env configuration

CORE_fs_defaultFS=hdfs://edata
CORE_hadoop_tmp_dir=/root/directory/hadoop/tmp
CORE_ha_zookeeper_quorum=bigdata01:2181,bigdata02:2181,bigdata03:2181

HDFS_dfs_nameservices=edata
HDFS_dfs_ha_namenodes_edata=nn1,nn2
HDFS_dfs_namenode_rpc___address_edata_nn1=bigdata01:8082
HDFS_dfs_namenode_rpc___address_edata_nn2=bigdata02:8082
HDFS_dfs_namenode_http___address_edata_nn1=bigdata01:9870
HDFS_dfs_namenode_http___address_edata_nn2=bigdata02:9870
HDFS_dfs_namenode_name_dir=/root/directory/hadoop/hdfs/name
HDFS_dfs_namenode_shared_edits_dir=qjournal://bigdata01:8485;bigdata02:8485;bigdata03:8485/edata
HDFS_dfs_ha_fencing_methods=sshfence\nshell(/bin/true)
HDFS_dfs_ha_fencing_ssh_private___key___files=/root/.ssh/id_rsa
HDFS_dfs_ha_automatic___failover_enabled=true
HDFS_dfs_client_failover_proxy_provider_edata=org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider
HDFS_dfs_datanode_data_dir=/root/directory/hadoop/hdfs/data
HDFS_dfs_journalnode_edits_dir=/root/directory/hadoop/hdfs/journal
HDFS_dfs_replication=3
HDFS_dfs_permissions_enabled=false

YARN_yarn_resourcemanager_ha_enabled=true
YARN_yarn_resourcemanager_cluster___id=edatayarn
YARN_yarn_resourcemanager_ha_rm___ids=rm1,rm2,rm3
YARN_yarn_resourcemanager_hostname_rm1=bigdata01
YARN_yarn_resourcemanager_hostname_rm2=bigdata02
YARN_yarn_resourcemanager_hostname_rm3=bigdata03
YARN_yarn_resourcemanager_webapp_address_rm1=bigdata01:8088
YARN_yarn_resourcemanager_webapp_address_rm2=bigdata02:8088
YARN_yarn_resourcemanager_webapp_address_rm3=bigdata03:8088
YARN_yarn_resourcemanager_recovery_enabled=true
YARN_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore
YARN_yarn_nodemanager_aux___services=mapreduce_shuffle
YARN_yarn_nodemanager_pmem___check___enabled=false
YARN_yarn_nodemanager_vmem___check___enabled=false
YARN_hadoop_zk_address=bigdata01:2181,bigdata02:2181,bigdata03:2181

MAPRED_mapreduce_framework_name=yarn
MAPRED_mapreduce_jobhistory_address=bigdata01:10020
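
After the cluster is up, the HA state can be checked from either namenode container (haadmin and rmadmin are standard Hadoop CLIs):

docker exec namenode hdfs haadmin -getAllServiceState
docker exec namenode yarn rmadmin -getAllServiceState

One namenode should report active and the other standby, and exactly one of the three resourcemanagers should be active.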