Deploy Hadoop
#!/usr/bin/env bash
# Hadoop distributed cluster teaching deployment script.
# Target environment: Alibaba Cloud ECS, CentOS 7.9, root user.
set -Eeuo pipefail
HADOOP_VERSION="3.3.6"
JAVA_ARCHIVE="openjdk-8u41-b04-linux-x64-14_jan_2020.tar.gz"
JAVA_DIR_NAME="java-se-8u41-ri"
HADOOP_ARCHIVE="hadoop-${HADOOP_VERSION}.tar.gz"
JAVA_DEFAULT_URL="https://download.java.net/openjdk/jdk8u41/ri/${JAVA_ARCHIVE}"
HADOOP_TUNA_URL="https://mirrors.tuna.tsinghua.edu.cn/apache/hadoop/common/hadoop-${HADOOP_VERSION}/${HADOOP_ARCHIVE}"
HADOOP_HUAWEI_URL="https://mirrors.huaweicloud.com/apache/hadoop/core/hadoop-${HADOOP_VERSION}/${HADOOP_ARCHIVE}"
HADOOP_APACHE_URL="https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/${HADOOP_ARCHIVE}"
CONFIG_FILE="/etc/hadoop-teaching-deploy.conf"
LOG_DIR="/var/log"
SOFTWARE_DIR="/opt/software"
JAVA_HOME_DIR="/opt/java8"
HADOOP_HOME_DIR="/opt/hadoop"
PROFILE_FILE="/etc/profile.d/hadoop-teaching.sh"
SSH_OPTS="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/root/.ssh/known_hosts -o ConnectTimeout=8"
HOSTS_BEGIN="# BEGIN HADOOP TEACHING CLUSTER"
HOSTS_END="# END HADOOP TEACHING CLUSTER"
MASTER_HOST=""
MASTER_PRIVATE_IP=""
MASTER_PUBLIC_IP=""
ROOT_PASSWORD=""
JAVA_URL="$JAVA_DEFAULT_URL"
HADOOP_URL="$HADOOP_TUNA_URL"
SECONDARY_HOST=""
REPLICATION=1
WORKER_HOSTS=()
WORKER_IPS=()
log_file=""
init_logging() {
  mkdir -p "$LOG_DIR"
  log_file="${LOG_DIR}/hadoop-teaching-deploy-$(date +%Y%m%d-%H%M%S).log"
  touch "$log_file"
  # Duplicate all stdout/stderr into the log file while still printing to the terminal.
  exec > >(tee -a "$log_file") 2>&1
}
pause_for_reading() {
  echo
  read -r -p "Press Enter to continue..." _
}
line() {
  printf '%*s\n' 78 '' | tr ' ' '-'
}
say() {
  echo "[$(date +%H:%M:%S)] $*"
}
section() {
  echo
  line
  echo "[STEP] $*"
  line
}
explain() {
  echo "Note: $*"
}
warn() {
  echo "Warning: $*"
}
die() {
  echo "Error: $*" >&2
  exit 1
}
run_cmd() {
  local description="$1"
  local command="$2"
  explain "$description"
  echo "+ $command"
  bash -lc "$command"
}
remote_cmd() {
  local host="$1"
  local description="$2"
  local command="$3"
  explain "${host}: ${description}"
  echo "+ ssh root@${host} \"$command\""
  # SSH_OPTS is intentionally unquoted: it carries several separate options.
  ssh $SSH_OPTS "root@${host}" "bash -lc $(printf '%q' "$command")"
}
remote_cmd_password() {
  local host="$1"
  local description="$2"
  local command="$3"
  explain "${host}: ${description}"
  echo "+ SSHPASS=****** sshpass -e ssh root@${host} \"$command\""
  SSHPASS="$ROOT_PASSWORD" sshpass -e ssh $SSH_OPTS "root@${host}" "bash -lc $(printf '%q' "$command")"
}
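# A quick sketch of why the remote command is wrapped in printf '%q' above
# (values illustrative): ssh joins its arguments and hands them to the remote
# login shell, which expands and word-splits them a second time. %q-quoting the
# payload once keeps quotes, spaces and $(...) intact through that extra round:
#   cmd='echo "host: $(hostname)"'
#   printf '%q\n' "$cmd"    # -> echo\ \"host:\ \$\(hostname\)\"
#   ssh root@slave1 "bash -lc $(printf '%q' "$cmd")"   # runs the command verbatim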
copy_to_remote() {
  local source_path="$1"
  local host="$2"
  local target_path="$3"
  local description="$4"
  explain "$description"
  echo "+ rsync -az --delete ${source_path} root@${host}:${target_path}"
  rsync -az --delete -e "ssh $SSH_OPTS" "$source_path" "root@${host}:${target_path}"
}
require_root() {
  if [[ "$(id -u)" != "0" ]]; then
    die "Please run as root: sudo bash deploy_hadoop_cluster.sh"
  fi
}
read_with_default() {
  local prompt="$1"
  local default_value="$2"
  local value
  if [[ -n "$default_value" ]]; then
    read -r -p "${prompt} [default: ${default_value}]: " value
    echo "${value:-$default_value}"
  else
    read -r -p "${prompt}: " value
    echo "$value"
  fi
}
read_required() {
  local prompt="$1"
  local value
  while true; do
    read -r -p "${prompt}: " value
    if [[ -n "$value" ]]; then
      echo "$value"
      return
    fi
    echo "This field cannot be empty; please try again." >&2
  done
}
read_secret_required() {
  local prompt="$1"
  local value
  while true; do
    read -r -s -p "${prompt}: " value
    echo >&2
    if [[ -n "$value" ]]; then
      echo "$value"
      return
    fi
    echo "This field cannot be empty; please try again." >&2
  done
}
strip_matching_outer_quotes() {
  local value="$1"
  if [[ ${#value} -ge 2 ]]; then
    if [[ "${value:0:1}" == "'" && "${value: -1}" == "'" ]]; then
      value="${value:1:${#value}-2}"
    elif [[ "${value:0:1}" == '"' && "${value: -1}" == '"' ]]; then
      value="${value:1:${#value}-2}"
    fi
  fi
  echo "$value"
}
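# Behavior sketch for strip_matching_outer_quotes (hypothetical passwords):
#   strip_matching_outer_quotes "'Pa!ss#1'"   # -> Pa!ss#1   (outer '...' removed)
#   strip_matching_outer_quotes '"Pa!ss#1"'   # -> Pa!ss#1   (outer "..." removed)
#   strip_matching_outer_quotes "Pa!ss#1"     # -> Pa!ss#1   (unchanged)
# Only one matching outer pair is stripped; quotes inside the value survive.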
read_positive_int() {
  local prompt="$1"
  local min_value="$2"
  local value
  while true; do
    read -r -p "${prompt}: " value
    if [[ "$value" =~ ^[0-9]+$ ]] && (( value >= min_value )); then
      echo "$value"
      return
    fi
    echo "Please enter an integer no smaller than ${min_value}." >&2
  done
}
choose_hadoop_url() {
  echo
  echo "Choose a download mirror for Hadoop ${HADOOP_VERSION}:"
  echo "1) Tsinghua University mirror (default)"
  echo "2) Huawei Cloud mirror"
  echo "3) Apache official archive"
  echo "4) Custom URL"
  local choice
  read -r -p "Enter an option [1-4, default 1]: " choice
  case "${choice:-1}" in
    1) HADOOP_URL="$HADOOP_TUNA_URL" ;;
    2) HADOOP_URL="$HADOOP_HUAWEI_URL" ;;
    3) HADOOP_URL="$HADOOP_APACHE_URL" ;;
    4) HADOOP_URL="$(read_required "Enter the full download URL of the Hadoop tar.gz")" ;;
    *) echo "Invalid input; falling back to the default Tsinghua mirror."; HADOOP_URL="$HADOOP_TUNA_URL" ;;
  esac
}
collect_cluster_info() {
  section "Collect cluster planning information"
  explain "This step corresponds to the 'cluster planning and design' part of the lab report. The script needs the hostname and private IP of every ECS instance; it will write them into /etc/hosts later so that Hadoop communicates over the private network."
  local node_count worker_count default_secondary_index
  node_count="$(read_positive_int "Enter the total number of servers in the cluster (including the master, at least 2)" 2)"
  worker_count=$((node_count - 1))
  MASTER_HOST="$(read_with_default "Enter the master hostname" "master")"
  MASTER_PRIVATE_IP="$(read_required "Enter the Alibaba Cloud private IP of ${MASTER_HOST}")"
  MASTER_PUBLIC_IP="$(read_required "Enter the public IP of ${MASTER_HOST} (used to print the Web UI URLs at the end)")"
  WORKER_HOSTS=()
  WORKER_IPS=()
  local i default_host worker_host worker_ip
  for ((i = 1; i <= worker_count; i++)); do
    default_host="slave${i}"
    worker_host="$(read_with_default "Enter the hostname of worker ${i}" "$default_host")"
    worker_ip="$(read_required "Enter the Alibaba Cloud private IP of ${worker_host}")"
    WORKER_HOSTS+=("$worker_host")
    WORKER_IPS+=("$worker_ip")
  done
  if (( worker_count >= 2 )); then
    default_secondary_index=2
  else
    default_secondary_index=1
  fi
  while true; do
    local index
    index="$(read_with_default "Choose the worker index that runs the SecondaryNameNode (1-${worker_count})" "$default_secondary_index")"
    if [[ "$index" =~ ^[0-9]+$ ]] && (( index >= 1 && index <= worker_count )); then
      SECONDARY_HOST="${WORKER_HOSTS[$((index - 1))]}"
      break
    fi
    echo "Please enter a number between 1 and ${worker_count}."
  done
  echo "Tip: type the real password as-is, without adding single or double quotes around it; the script safely handles special characters such as !, @ and # in the password."
  ROOT_PASSWORD="$(read_secret_required "Enter the shared root password of all ECS instances (used only to distribute the SSH public key; never written to the config file)")"
  ROOT_PASSWORD="$(strip_matching_outer_quotes "$ROOT_PASSWORD")"
  JAVA_URL="$(read_with_default "Enter the Java download URL" "$JAVA_DEFAULT_URL")"
  choose_hadoop_url
  if (( worker_count >= 3 )); then
    REPLICATION=3
  else
    REPLICATION="$worker_count"
  fi
  echo
  echo "Cluster plan confirmation:"
  echo "  master: ${MASTER_HOST} (${MASTER_PRIVATE_IP})"
  for ((i = 0; i < worker_count; i++)); do
    echo "  worker: ${WORKER_HOSTS[$i]} (${WORKER_IPS[$i]})"
  done
  echo "  SecondaryNameNode: ${SECONDARY_HOST}"
  echo "  HDFS replication factor dfs.replication: ${REPLICATION}"
  explain "The replication factor is set to min(3, number of workers) automatically. A 4-node lab cluster thus uses 3 replicas, while clusters with only 1 or 2 workers still start normally instead of requiring data nodes that do not exist."
}
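# Example of a completed plan for a 1+3 lab cluster (all values illustrative):
#   master  master  private 172.16.0.10  public 47.xx.xx.xx
#   worker  slave1  private 172.16.0.11
#   worker  slave2  private 172.16.0.12   <- SecondaryNameNode (default index 2)
#   worker  slave3  private 172.16.0.13
# With 3 workers, dfs.replication = min(3, 3) = 3.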
save_config() {
  umask 077
  {
    printf 'MASTER_HOST=%q\n' "$MASTER_HOST"
    printf 'MASTER_PRIVATE_IP=%q\n' "$MASTER_PRIVATE_IP"
    printf 'MASTER_PUBLIC_IP=%q\n' "$MASTER_PUBLIC_IP"
    printf 'JAVA_URL=%q\n' "$JAVA_URL"
    printf 'HADOOP_URL=%q\n' "$HADOOP_URL"
    printf 'SECONDARY_HOST=%q\n' "$SECONDARY_HOST"
    printf 'REPLICATION=%q\n' "$REPLICATION"
    printf 'WORKER_HOSTS=('
    local item
    for item in "${WORKER_HOSTS[@]}"; do
      printf '%q ' "$item"
    done
    printf ')\n'
    printf 'WORKER_IPS=('
    for item in "${WORKER_IPS[@]}"; do
      printf '%q ' "$item"
    done
    printf ')\n'
  } > "$CONFIG_FILE"
  say "Cluster configuration saved to ${CONFIG_FILE}."
}
load_config() {
  if [[ ! -f "$CONFIG_FILE" ]]; then
    return 1
  fi
  # shellcheck source=/dev/null
  source "$CONFIG_FILE"
  return 0
}
ensure_config_loaded_or_collect() {
  if load_config; then
    say "Loaded existing cluster configuration: ${CONFIG_FILE}"
  else
    collect_cluster_info
    save_config
  fi
}
all_hosts() {
  printf '%s\n' "$MASTER_HOST"
  printf '%s\n' "${WORKER_HOSTS[@]}"
}
worker_count() {
  echo "${#WORKER_HOSTS[@]}"
}
check_local_environment() {
  section "Check the basic environment on the master"
  explain "A distributed Hadoop deployment depends on Linux commands, SSH, networking and disk space. Checking these prerequisites first surfaces errors as early as possible."
  require_root
  [[ -r /etc/centos-release ]] || die "/etc/centos-release not found. This script targets the CentOS 7.9 teaching environment."
  local release
  release="$(cat /etc/centos-release)"
  echo "Current system: ${release}"
  if [[ "$release" != *"7.9"* ]]; then
    warn "This script was written for CentOS 7.9 and the current system is not 7.9. Continuing may require manual troubleshooting."
  fi
  command -v yum >/dev/null 2>&1 || die "yum not found."
  command -v systemctl >/dev/null 2>&1 || die "systemctl not found."
  run_cmd "Check free space under /opt to confirm there is room for Java, Hadoop and HDFS data." "df -h /opt || df -h /"
  run_cmd "Test whether DNS can resolve the external download domain. If this fails, check the ECS network or DNS settings." "getent hosts archive.apache.org >/dev/null"
}
install_base_tools() {
  section "Install the base tools needed for deployment"
  explain "curl/wget download the installation packages, openssh-clients/sshpass distribute the SSH public key, and rsync synchronizes the software directories and configuration files."
  run_cmd "Install epel-release; sshpass usually comes from the EPEL repository." "yum install -y epel-release"
  run_cmd "Install the deployment tools." "yum install -y curl wget tar openssh-clients rsync sshpass"
  run_cmd "Test HTTPS access to the external download source. If this fails, check the ECS network, the security group's egress rules, or DNS." "curl -I --connect-timeout 8 https://archive.apache.org >/dev/null"
}
disable_firewalld() {
  section "Handle the local firewall"
  explain "To cut down on port troubleshooting during class, this script disables firewalld. The Alibaba Cloud security group still controls public access to the Web UIs, so the relevant ports must be opened in the console."
  if systemctl list-unit-files | grep -q '^firewalld\.service'; then
    run_cmd "Stop firewalld." "systemctl stop firewalld || true"
    run_cmd "Disable firewalld at boot." "systemctl disable firewalld || true"
  else
    say "firewalld service not detected; skipping."
  fi
  warn "Open ports 22, 9870, 8020, 9864, 9867, 9866, 9868, 8088, 19888 and 10020 in the Alibaba Cloud security group."
}
verify_password_ssh_access() {
  section "Pre-check SSH password login to the workers"
  explain "Before changing any worker hostnames, confirm that the master can log in to every worker with the shared root password. If the password, the root login policy or the security group is wrong, the script stops before touching any system configuration."
  local i ip host
  for ((i = 0; i < $(worker_count); i++)); do
    ip="${WORKER_IPS[$i]}"
    host="${WORKER_HOSTS[$i]}"
    echo "+ SSHPASS=****** sshpass -e ssh root@${ip} \"echo password ssh to ${host} succeeded\""
    if SSHPASS="$ROOT_PASSWORD" sshpass -e ssh $SSH_OPTS "root@${ip}" "echo password ssh to ${host} succeeded"; then
      say "Password SSH login to ${host} (${ip}) succeeded."
    else
      echo
      echo "Unable to log in to ${host} (${ip}) with the root password. Check the following first:"
      echo "1. Whether the shared root password you entered is correct."
      echo "2. Whether /etc/ssh/sshd_config on the target ECS allows PasswordAuthentication yes."
      echo "3. Whether the target ECS permits root login: PermitRootLogin yes, or a usable policy other than prohibit-password."
      echo "4. Whether the Alibaba Cloud security group lets the master reach port 22 on the worker."
      echo
      echo "To test a password containing ! by hand, do not type it directly into an interactive Bash command. Do this instead:"
      echo "  read -r -s SSHPASS"
      echo "  export SSHPASS"
      echo "  sshpass -e ssh root@${ip} 'hostname'"
      echo
      echo "Or disable history expansion first:"
      echo "  set +H"
      echo "  sshpass -p 'your-password' ssh root@${ip} 'hostname'"
      die "SSH password pre-check failed; deployment stopped."
    fi
  done
}
configure_hostnames() {
  section "Set hostnames"
  explain "Hostnames give each server a stable identity. Names such as master and slave1 in the Hadoop configuration are more readable than raw IPs and match the lab report."
  run_cmd "Set the master node hostname." "hostnamectl set-hostname $(printf '%q' "$MASTER_HOST")"
  local i host
  for ((i = 0; i < $(worker_count); i++)); do
    host="${WORKER_HOSTS[$i]}"
    remote_cmd_password "${WORKER_IPS[$i]}" "Set the worker hostname to ${host}. The connection still uses the IP because /etc/hosts is not configured yet." "hostnamectl set-hostname $(printf '%q' "$host")"
  done
}
hosts_block() {
  echo "$HOSTS_BEGIN"
  echo "${MASTER_PRIVATE_IP} ${MASTER_HOST}"
  local i
  for ((i = 0; i < $(worker_count); i++)); do
    echo "${WORKER_IPS[$i]} ${WORKER_HOSTS[$i]}"
  done
  echo "$HOSTS_END"
}
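# For the illustrative 1+3 plan above, hosts_block emits:
#   # BEGIN HADOOP TEACHING CLUSTER
#   172.16.0.10 master
#   172.16.0.11 slave1
#   172.16.0.12 slave2
#   172.16.0.13 slave3
#   # END HADOOP TEACHING CLUSTER
# write_hosts_file below removes any previous block between the two markers
# before appending a fresh one, so repeated runs never duplicate entries.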
write_hosts_file() {
  local tmp_hosts="/tmp/hadoop-teaching-hosts.$$"
  local tmp_block="/tmp/hadoop-teaching-hosts-block.$$"
  hosts_block > "$tmp_block"
  # Drop any previous block between the BEGIN/END markers, then append the fresh one.
  awk -v begin="$HOSTS_BEGIN" -v end="$HOSTS_END" '
    $0 == begin {skip=1; next}
    $0 == end {skip=0; next}
    skip == 0 {print}
  ' /etc/hosts > "$tmp_hosts"
  cat "$tmp_block" >> "$tmp_hosts"
  cp "$tmp_hosts" /etc/hosts
  rm -f "$tmp_hosts" "$tmp_block"
}
configure_hosts_files() {
  section "Configure /etc/hosts"
  explain "Map every node's hostname to its Alibaba Cloud private IP. Hadoop nodes then talk to each other over the private network, which is faster and consumes no public bandwidth."
  write_hosts_file
  echo "Current Hadoop hosts block on the master:"
  sed -n "/${HOSTS_BEGIN}/,/${HOSTS_END}/p" /etc/hosts
  local i host ip
  for ((i = 0; i < $(worker_count); i++)); do
    host="${WORKER_HOSTS[$i]}"
    ip="${WORKER_IPS[$i]}"
    explain "Sync /etc/hosts to ${host}. The IP is still used here to avoid resolution failures while the hosts file is not yet in effect."
    echo "+ SSHPASS=****** sshpass -e scp /etc/hosts root@${ip}:/etc/hosts"
    SSHPASS="$ROOT_PASSWORD" sshpass -e scp $SSH_OPTS /etc/hosts "root@${ip}:/etc/hosts"
  done
  run_cmd "Verify hostname resolution on the master." "for h in $(printf '%q ' "$MASTER_HOST" "${WORKER_HOSTS[@]}"); do getent hosts \"\$h\"; done"
}
setup_ssh_key() {
  section "Configure passwordless SSH login"
  explain "Hadoop's start-dfs.sh and start-yarn.sh SSH from the master to each worker to start services, so the master must be able to log in to every node without a password."
  mkdir -p /root/.ssh
  chmod 700 /root/.ssh
  if [[ ! -f /root/.ssh/id_rsa ]]; then
    run_cmd "Generate the master node SSH key pair. -N '' means no private-key passphrase, so Hadoop can start remote services unattended." "ssh-keygen -t rsa -b 4096 -N '' -f /root/.ssh/id_rsa"
  else
    say "/root/.ssh/id_rsa already exists; skipping key generation."
  fi
  local target
  for target in "$MASTER_HOST" "${WORKER_HOSTS[@]}"; do
    if [[ "$target" == "$MASTER_HOST" ]]; then
      explain "Add the master's public key to its own authorized_keys so the master can also SSH to itself without a password."
      echo "+ install local public key into /root/.ssh/authorized_keys"
      touch /root/.ssh/authorized_keys
      chmod 600 /root/.ssh/authorized_keys
      grep -qxF "$(cat /root/.ssh/id_rsa.pub)" /root/.ssh/authorized_keys || cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys
    else
      explain "Distribute the master's public key to ${target}; subsequent logins need no password."
      echo "+ SSHPASS=****** sshpass -e ssh-copy-id root@${target}"
      SSHPASS="$ROOT_PASSWORD" sshpass -e ssh-copy-id $SSH_OPTS "root@${target}" >/dev/null
    fi
  done
  section "Verify passwordless SSH login"
  for target in "$MASTER_HOST" "${WORKER_HOSTS[@]}"; do
    remote_cmd "$target" "Verify that the master can run commands on ${target} without a password." "echo SSH to ${target} succeeded"
  done
}
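# Optional manual spot check (illustrative hostname): after distribution, this
# should print the worker's hostname without any password prompt. BatchMode
# makes ssh fail instead of prompting if the key is missing:
#   ssh -o BatchMode=yes root@slave1 hostname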
stop_hadoop_services_best_effort() {
  section "Stop any existing Hadoop services"
  explain "Before a fresh overwrite deployment, try to stop old services so that no process still holds files when the directories are cleaned. If Hadoop is not installed, a failure here does not abort the script."
  if [[ -x "${HADOOP_HOME_DIR}/sbin/stop-yarn.sh" || -x "${HADOOP_HOME_DIR}/sbin/stop-dfs.sh" ]]; then
    bash -lc "source ${PROFILE_FILE} 2>/dev/null || true; mapred --daemon stop historyserver 2>/dev/null || true; stop-yarn.sh 2>/dev/null || true; stop-dfs.sh 2>/dev/null || true" || true
  else
    say "No usable Hadoop stop scripts detected; skipping."
  fi
}
cleanup_old_installation() {
  section "Clean up the old installation and old HDFS data"
  explain "This is the fresh-overwrite strategy for the teaching environment. It deletes /opt/java8, /opt/hadoop and the cached installation packages; old NameNode and DataNode data is removed as well."
  local cleanup_cmd
  cleanup_cmd="rm -rf ${JAVA_HOME_DIR} ${HADOOP_HOME_DIR} ${SOFTWARE_DIR:?}/${JAVA_ARCHIVE} ${SOFTWARE_DIR:?}/${HADOOP_ARCHIVE} ${PROFILE_FILE}"
  run_cmd "Clean old directories and packages on the master." "mkdir -p ${SOFTWARE_DIR}; ${cleanup_cmd}"
  local host
  for host in "${WORKER_HOSTS[@]}"; do
    remote_cmd "$host" "Clean old directories and packages on ${host}." "mkdir -p ${SOFTWARE_DIR}; ${cleanup_cmd}"
  done
}
download_with_resume() {
  local url="$1"
  local output="$2"
  local description="$3"
  explain "$description"
  echo "+ curl -L --fail --retry 3 -C - -o ${output} ${url}"
  curl -L --fail --retry 3 -C - -o "$output" "$url"
}
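# Flag notes for the download helper: --fail turns HTTP errors such as 404
# into a non-zero exit so set -e aborts, --retry 3 retries transient network
# failures, and -C - resumes from the bytes already on disk. Re-running after
# an interrupted download picks up where it stopped, e.g. (illustrative):
#   curl -L --fail --retry 3 -C - -o /opt/software/hadoop-3.3.6.tar.gz "$HADOOP_URL"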
install_java_and_hadoop() {
  section "Download and install Java and Hadoop"
  explain "Hadoop is written in Java, so a JDK must be installed first. This keeps the OpenJDK 8 and Hadoop 3.3.6 versions from the lab report."
  mkdir -p "$SOFTWARE_DIR"
  download_with_resume "$JAVA_URL" "${SOFTWARE_DIR}/${JAVA_ARCHIVE}" "Download the OpenJDK 8 package."
  download_with_resume "$HADOOP_URL" "${SOFTWARE_DIR}/${HADOOP_ARCHIVE}" "Download the Hadoop ${HADOOP_VERSION} package."
  run_cmd "Extract OpenJDK into /opt and rename it to /opt/java8." "tar -zxf ${SOFTWARE_DIR}/${JAVA_ARCHIVE} -C /opt && mv /opt/${JAVA_DIR_NAME} ${JAVA_HOME_DIR}"
  run_cmd "Extract Hadoop into /opt and rename it to /opt/hadoop." "tar -zxf ${SOFTWARE_DIR}/${HADOOP_ARCHIVE} -C /opt && mv /opt/hadoop-${HADOOP_VERSION} ${HADOOP_HOME_DIR}"
}
write_profile() {
  section "Configure environment variables"
  explain "With Java and Hadoop on the PATH, students can run java, hadoop, hdfs, start-dfs.sh and so on from any directory. The variables go into a standalone file under /etc/profile.d instead of editing /etc/profile itself."
  cat > "$PROFILE_FILE" <<EOF_PROFILE
export JAVA_HOME=${JAVA_HOME_DIR}
export HADOOP_HOME=${HADOOP_HOME_DIR}
export PATH=\$PATH:\$JAVA_HOME/bin:\$HADOOP_HOME/bin:\$HADOOP_HOME/sbin
EOF_PROFILE
  chmod 0644 "$PROFILE_FILE"
  run_cmd "Verify the Java version." "source ${PROFILE_FILE}; java -version"
  run_cmd "Verify the Hadoop version." "source ${PROFILE_FILE}; hadoop version"
}
write_xml_config() {
  local file="$1"
  shift
  # Remaining arguments come in (name, value, comment) triples.
  {
    echo '<?xml version="1.0" encoding="UTF-8"?>'
    echo '<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>'
    echo '<configuration>'
    while (( "$#" )); do
      local name="$1"
      local value="$2"
      local comment="$3"
      shift 3
      echo "  <!-- ${comment} -->"
      echo "  <property>"
      echo "    <name>${name}</name>"
      echo "    <value>${value}</value>"
      echo "  </property>"
    done
    echo '</configuration>'
  } > "$file"
}
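# Illustrative call (hypothetical file and property) and the XML it writes:
#   write_xml_config /tmp/demo-site.xml \
#     "fs.defaultFS" "hdfs://master:8020" "HDFS entry point."
# /tmp/demo-site.xml then contains:
#   <?xml version="1.0" encoding="UTF-8"?>
#   <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
#   <configuration>
#     <!-- HDFS entry point. -->
#     <property>
#       <name>fs.defaultFS</name>
#       <value>hdfs://master:8020</value>
#     </property>
#   </configuration>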
configure_hadoop_files() {
  section "Generate the Hadoop distributed configuration files"
  explain "These XML files decide where the HDFS, YARN and MapReduce roles live and how they run. The script generates them from the hostnames entered earlier."
  local conf_dir="${HADOOP_HOME_DIR}/etc/hadoop"
  write_xml_config "${conf_dir}/core-site.xml" \
    "fs.defaultFS" "hdfs://${MASTER_HOST}:8020" "Address and port of the HDFS filesystem master." \
    "hadoop.tmp.dir" "${HADOOP_HOME_DIR}/hdfs/tmp" "Directory for Hadoop temporary files." \
    "hadoop.proxyuser.root.hosts" "*" "Let the root proxy user connect from any host, to simplify the teaching lab." \
    "hadoop.proxyuser.root.groups" "*" "Let the root proxy user act for any group, to simplify the teaching lab."
  write_xml_config "${conf_dir}/hdfs-site.xml" \
    "dfs.replication" "$REPLICATION" "HDFS block replication factor." \
    "dfs.namenode.name.dir" "${HADOOP_HOME_DIR}/hdfs/name" "Directory for NameNode metadata." \
    "dfs.datanode.data.dir" "${HADOOP_HOME_DIR}/hdfs/data" "Directory for DataNode blocks." \
    "dfs.namenode.http-address" "${MASTER_HOST}:9870" "NameNode web UI address." \
    "dfs.namenode.secondary.http-address" "${SECONDARY_HOST}:9868" "SecondaryNameNode web UI address." \
    "dfs.permissions" "false" "Disable HDFS permission checks to reduce lab complexity."
  write_xml_config "${conf_dir}/yarn-site.xml" \
    "yarn.resourcemanager.hostname" "$MASTER_HOST" "Node that runs the ResourceManager." \
    "yarn.nodemanager.aux-services" "mapreduce_shuffle" "Enable the MapReduce shuffle service." \
    "yarn.log.server.url" "http://${MASTER_HOST}:19888/jobhistory/logs" "URL of the history log server." \
    "yarn.log-aggregation-enable" "true" "Enable YARN log aggregation."
  write_xml_config "${conf_dir}/mapred-site.xml" \
    "mapreduce.framework.name" "yarn" "Run MapReduce programs on YARN." \
    "mapreduce.jobhistory.address" "${MASTER_HOST}:10020" "JobHistoryServer RPC address." \
    "mapreduce.jobhistory.webapp.address" "${MASTER_HOST}:19888" "JobHistoryServer web address." \
    "yarn.app.mapreduce.am.env" "HADOOP_MAPRED_HOME=\${HADOOP_HOME}" "Environment for the MapReduce ApplicationMaster." \
    "mapreduce.map.env" "HADOOP_MAPRED_HOME=\${HADOOP_HOME}" "Environment for map tasks." \
    "mapreduce.reduce.env" "HADOOP_MAPRED_HOME=\${HADOOP_HOME}" "Environment for reduce tasks."
  {
    echo "export JAVA_HOME=${JAVA_HOME_DIR}"
    echo "export HADOOP_HOME=${HADOOP_HOME_DIR}"
    echo "export HDFS_NAMENODE_USER=root"
    echo "export HDFS_DATANODE_USER=root"
    echo "export HDFS_SECONDARYNAMENODE_USER=root"
    echo "export YARN_RESOURCEMANAGER_USER=root"
    echo "export YARN_NODEMANAGER_USER=root"
  } >> "${conf_dir}/hadoop-env.sh"
  printf '%s\n' "${WORKER_HOSTS[@]}" > "${conf_dir}/workers"
  echo "Contents of the workers file:"
  cat "${conf_dir}/workers"
  run_cmd "Create the NameNode metadata directories on the master." "mkdir -p ${HADOOP_HOME_DIR}/hdfs/name ${HADOOP_HOME_DIR}/hdfs/tmp"
}
create_worker_hdfs_dirs() {
  section "Create the HDFS data directories on the workers"
  explain "DataNodes store HDFS blocks in the data directory of each worker. Creating the directories after the software sync keeps rsync's --delete from wiping the empty directories during the overwrite install."
  local host
  for host in "${WORKER_HOSTS[@]}"; do
    remote_cmd "$host" "Create the DataNode data directories." "mkdir -p ${HADOOP_HOME_DIR}/hdfs/data ${HADOOP_HOME_DIR}/hdfs/tmp"
  done
}
sync_installation_to_workers() {
  section "Sync Java, Hadoop and the environment variables to the workers"
  explain "To keep the cluster environment consistent, the same directories are synced to every worker once the master is installed and configured."
  local host
  for host in "${WORKER_HOSTS[@]}"; do
    copy_to_remote "${JAVA_HOME_DIR}/" "$host" "${JAVA_HOME_DIR}/" "Sync Java to ${host}."
    copy_to_remote "${HADOOP_HOME_DIR}/" "$host" "${HADOOP_HOME_DIR}/" "Sync Hadoop to ${host}."
    explain "Sync the environment variable file to ${host}."
    echo "+ scp ${PROFILE_FILE} root@${host}:${PROFILE_FILE}"
    scp $SSH_OPTS "$PROFILE_FILE" "root@${host}:${PROFILE_FILE}"
    remote_cmd "$host" "Verify that the java and hadoop commands work." "source ${PROFILE_FILE}; java -version; hadoop version | head -n 1"
  done
}
format_namenode() {
  section "Format the NameNode"
  explain "Formatting the NameNode initializes the HDFS namespace. It is mandatory for a fresh deployment; re-formatting wipes HDFS metadata, so this script only runs it in the fresh-overwrite flow."
  run_cmd "Format the NameNode." "source ${PROFILE_FILE}; hdfs namenode -format -force"
}
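# -force answers the "Re-format filesystem?" prompt automatically so the run
# is unattended. On success, the format output normally includes a line like
# (exact path follows dfs.namenode.name.dir):
#   Storage directory /opt/hadoop/hdfs/name has been successfully formatted.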
start_cluster() {
  ensure_config_loaded_or_collect
  section "Start the Hadoop distributed cluster"
  explain "The start order is HDFS -> YARN -> JobHistoryServer. HDFS provides distributed storage, YARN schedules resources, and the history server records MapReduce job history."
  run_cmd "Start HDFS." "source ${PROFILE_FILE}; start-dfs.sh"
  run_cmd "Start YARN." "source ${PROFILE_FILE}; start-yarn.sh"
  run_cmd "Start the MapReduce JobHistoryServer." "source ${PROFILE_FILE}; mapred --daemon start historyserver"
  check_status
  print_web_urls
}
stop_cluster() {
  ensure_config_loaded_or_collect
  section "Stop the Hadoop distributed cluster"
  explain "The stop order is JobHistoryServer -> YARN -> HDFS. Stopping the upper compute services before the underlying storage reduces noise from abnormal shutdowns."
  run_cmd "Stop the MapReduce JobHistoryServer." "source ${PROFILE_FILE}; mapred --daemon stop historyserver || true"
  run_cmd "Stop YARN." "source ${PROFILE_FILE}; stop-yarn.sh || true"
  run_cmd "Stop HDFS." "source ${PROFILE_FILE}; stop-dfs.sh || true"
  check_jps_all_nodes
}
check_jps_all_nodes() {
  section "Check the Java processes on each node"
  explain "jps lists the Java processes on a node. NameNode, DataNode, ResourceManager, NodeManager and so on should each be visible on the appropriate node."
  run_cmd "List processes on the master." "source ${PROFILE_FILE} 2>/dev/null || true; jps || true"
  local host
  for host in "${WORKER_HOSTS[@]}"; do
    remote_cmd "$host" "List processes on ${host}." "source ${PROFILE_FILE} 2>/dev/null || true; jps || true"
  done
}
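# Expected process layout for this script's configuration once the cluster is up:
#   master          NameNode, ResourceManager, JobHistoryServer
#   every worker    DataNode, NodeManager
#   SECONDARY_HOST  additionally SecondaryNameNode
# If a daemon is missing, check its log under /opt/hadoop/logs on that node.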
check_status() {
  ensure_config_loaded_or_collect
  check_jps_all_nodes
  section "Check HDFS and YARN node status"
  explain "dfsadmin -report shows whether the DataNodes are online; yarn node -list shows whether the NodeManagers have joined the cluster."
  run_cmd "Show the HDFS DataNode report." "source ${PROFILE_FILE}; hdfs dfsadmin -report || true"
  run_cmd "List the YARN NodeManagers." "source ${PROFILE_FILE}; yarn node -list || true"
}
print_web_urls() {
  section "Web UI addresses"
  explain "Use these pages to observe cluster state. If they are unreachable from the public internet, first check whether the ports are open in the Alibaba Cloud security group."
  echo "HDFS NameNode:        http://${MASTER_PUBLIC_IP}:9870/"
  echo "YARN ResourceManager: http://${MASTER_PUBLIC_IP}:8088/"
  echo "JobHistoryServer:     http://${MASTER_PUBLIC_IP}:19888/"
}
full_deploy() {
  collect_cluster_info
  save_config
  check_local_environment
  install_base_tools
  disable_firewalld
  verify_password_ssh_access
  configure_hostnames
  configure_hosts_files
  setup_ssh_key
  stop_hadoop_services_best_effort
  cleanup_old_installation
  install_java_and_hadoop
  write_profile
  configure_hadoop_files
  sync_installation_to_workers
  create_worker_hdfs_dirs
  format_namenode
  start_cluster
}
clean_reinstall() {
  section "Clean reinstall"
  explain "A clean reinstall re-collects the cluster information, stops services, deletes the old directories and then performs a complete fresh deployment."
  full_deploy
}
show_menu() {
  echo
  line
  echo "Hadoop distributed cluster automated deployment teaching script"
  line
  echo "1) Fresh deployment"
  echo "2) Start cluster"
  echo "3) Stop cluster"
  echo "4) Status check"
  echo "5) Clean reinstall"
  echo "0) Exit"
}
main() {
  require_root
  init_logging
  say "Log file: ${log_file}"
  warn "This script targets teaching lab environments: a fresh deployment or clean reinstall deletes the old Hadoop, Java and HDFS data directories."
  while true; do
    show_menu
    local choice
    read -r -p "Select an operation: " choice
    case "$choice" in
      1) full_deploy; pause_for_reading ;;
      2) start_cluster; pause_for_reading ;;
      3) stop_cluster; pause_for_reading ;;
      4) check_status; print_web_urls; pause_for_reading ;;
      5) clean_reinstall; pause_for_reading ;;
      0) say "Exiting."; break ;;
      *) echo "Invalid option; please try again." ;;
    esac
  done
}
main "$@"
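# Usage sketch (run on the master as root; the filename is whatever you saved
# this gist as, deploy_hadoop_cluster.sh is assumed here):
#   sudo bash deploy_hadoop_cluster.sh
# Pick "1) Fresh deployment" from the menu and answer the prompts; a full
# transcript is written to /var/log/hadoop-teaching-deploy-<timestamp>.log.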