@kangvcar
Created May 14, 2026 02:41
Deploy Hadoop
#!/usr/bin/env bash
# Hadoop distributed cluster teaching deployment script.
# Target environment: Alibaba Cloud ECS, CentOS 7.9, root user.
set -Eeuo pipefail
HADOOP_VERSION="3.3.6"
JAVA_ARCHIVE="openjdk-8u41-b04-linux-x64-14_jan_2020.tar.gz"
JAVA_DIR_NAME="java-se-8u41-ri"
HADOOP_ARCHIVE="hadoop-${HADOOP_VERSION}.tar.gz"
JAVA_DEFAULT_URL="https://download.java.net/openjdk/jdk8u41/ri/${JAVA_ARCHIVE}"
HADOOP_TUNA_URL="https://mirrors.tuna.tsinghua.edu.cn/apache/hadoop/common/hadoop-${HADOOP_VERSION}/${HADOOP_ARCHIVE}"
HADOOP_HUAWEI_URL="https://mirrors.huaweicloud.com/apache/hadoop/core/hadoop-${HADOOP_VERSION}/${HADOOP_ARCHIVE}"
HADOOP_APACHE_URL="https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/${HADOOP_ARCHIVE}"
CONFIG_FILE="/etc/hadoop-teaching-deploy.conf"
LOG_DIR="/var/log"
SOFTWARE_DIR="/opt/software"
JAVA_HOME_DIR="/opt/java8"
HADOOP_HOME_DIR="/opt/hadoop"
PROFILE_FILE="/etc/profile.d/hadoop-teaching.sh"
SSH_OPTS="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/root/.ssh/known_hosts -o ConnectTimeout=8"
HOSTS_BEGIN="# BEGIN HADOOP TEACHING CLUSTER"
HOSTS_END="# END HADOOP TEACHING CLUSTER"
MASTER_HOST=""
MASTER_PRIVATE_IP=""
MASTER_PUBLIC_IP=""
ROOT_PASSWORD=""
JAVA_URL="$JAVA_DEFAULT_URL"
HADOOP_URL="$HADOOP_TUNA_URL"
SECONDARY_HOST=""
REPLICATION=1
WORKER_HOSTS=()
WORKER_IPS=()
log_file=""
init_logging() {
mkdir -p "$LOG_DIR"
log_file="${LOG_DIR}/hadoop-teaching-deploy-$(date +%Y%m%d-%H%M%S).log"
touch "$log_file"
exec > >(tee -a "$log_file") 2>&1
}
pause_for_reading() {
echo
read -r -p "按 Enter 继续..." _
}
line() {
printf '%*s\n' 78 '' | tr ' ' '-'
}
say() {
echo "[$(date +%H:%M:%S)] $*"
}
section() {
echo
line
echo "【步骤】$*"
line
}
explain() {
echo "说明:$*"
}
warn() {
echo "提醒:$*"
}
die() {
echo "错误:$*" >&2
exit 1
}
run_cmd() {
local description="$1"
local command="$2"
explain "$description"
echo "+ $command"
bash -lc "$command"
}
remote_cmd() {
local host="$1"
local description="$2"
local command="$3"
explain "${host}:${description}"
echo "+ ssh root@${host} \"$command\""
ssh $SSH_OPTS "root@${host}" "bash -lc $(printf '%q' "$command")"
}
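# Illustrative note (not part of the deployment flow): printf '%q' quotes the
# whole command string so it survives the extra shell layer that ssh adds.
# For a hypothetical call such as
#   remote_cmd slave1 "create a dir with spaces" "mkdir -p '/opt/demo dir'"
# the local side runs roughly
#   ssh root@slave1 bash -lc mkdir\ -p\ \'/opt/demo\ dir\'
# so the spaces and single quotes reach the remote bash unmodified.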
remote_cmd_password() {
local host="$1"
local description="$2"
local command="$3"
explain "${host}:${description}"
echo "+ SSHPASS=****** sshpass -e ssh root@${host} \"$command\""
SSHPASS="$ROOT_PASSWORD" sshpass -e ssh $SSH_OPTS "root@${host}" "bash -lc $(printf '%q' "$command")"
}
copy_to_remote() {
local source_path="$1"
local host="$2"
local target_path="$3"
local description="$4"
explain "$description"
echo "+ rsync -az --delete ${source_path} root@${host}:${target_path}"
rsync -az --delete -e "ssh $SSH_OPTS" "$source_path" "root@${host}:${target_path}"
}
require_root() {
if [[ "$(id -u)" != "0" ]]; then
die "请使用 root 用户运行:sudo bash deploy_hadoop_cluster.sh"
fi
}
read_with_default() {
local prompt="$1"
local default_value="$2"
local value
if [[ -n "$default_value" ]]; then
read -r -p "${prompt} [默认:${default_value}]:" value
echo "${value:-$default_value}"
else
read -r -p "${prompt}:" value
echo "$value"
fi
}
read_required() {
local prompt="$1"
local value
while true; do
read -r -p "${prompt}:" value
if [[ -n "$value" ]]; then
echo "$value"
return
fi
echo "该项不能为空,请重新输入。" >&2
done
}
read_secret_required() {
local prompt="$1"
local value
while true; do
read -r -s -p "${prompt}: " value
echo >&2
if [[ -n "$value" ]]; then
echo "$value"
return
fi
echo "This field cannot be empty. Please try again." >&2
done
}
strip_matching_outer_quotes() {
local value="$1"
if [[ ${#value} -ge 2 ]]; then
if [[ "${value:0:1}" == "'" && "${value: -1}" == "'" ]]; then
value="${value:1:${#value}-2}"
elif [[ "${value:0:1}" == '"' && "${value: -1}" == '"' ]]; then
value="${value:1:${#value}-2}"
fi
fi
echo "$value"
}
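# Illustrative note: strip_matching_outer_quotes only removes one matching pair
# of outer quotes, e.g. with hypothetical values
#   strip_matching_outer_quotes "'Passw0rd!'"  ->  Passw0rd!
#   strip_matching_outer_quotes "Passw0rd!"    ->  Passw0rd!   (unchanged)
# This catches users who wrap the root password in quotes at the prompt.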
read_positive_int() {
local prompt="$1"
local min_value="$2"
local value
while true; do
read -r -p "${prompt}:" value
if [[ "$value" =~ ^[0-9]+$ ]] && (( value >= min_value )); then
echo "$value"
return
fi
echo "请输入不小于 ${min_value} 的整数。" >&2
done
}
choose_hadoop_url() {
echo
echo "请选择 Hadoop ${HADOOP_VERSION} 下载源:"
echo "1) 清华大学镜像站(默认)"
echo "2) 华为云镜像站"
echo "3) Apache 官方归档站"
echo "4) 自定义 URL"
local choice
read -r -p "请输入选项 [1-4,默认 1]:" choice
case "${choice:-1}" in
1) HADOOP_URL="$HADOOP_TUNA_URL" ;;
2) HADOOP_URL="$HADOOP_HUAWEI_URL" ;;
3) HADOOP_URL="$HADOOP_APACHE_URL" ;;
4) HADOOP_URL="$(read_required "请输入 Hadoop tar.gz 完整下载 URL")" ;;
*) echo "输入无效,使用默认清华源。"; HADOOP_URL="$HADOOP_TUNA_URL" ;;
esac
}
collect_cluster_info() {
section "收集集群规划信息"
explain "这一部分对应实验报告中的“集群规划与设计”。脚本需要知道每台 ECS 的主机名和内网 IP,后续会据此写入 /etc/hosts,并让 Hadoop 使用内网通信。"
local node_count worker_count default_secondary_index
node_count="$(read_positive_int "请输入集群服务器总数量(包含 master,至少 2 台)" 2)"
worker_count=$((node_count - 1))
MASTER_HOST="$(read_with_default "请输入 master 主机名" "master")"
MASTER_PRIVATE_IP="$(read_required "请输入 ${MASTER_HOST} 的阿里云内网 IP")"
MASTER_PUBLIC_IP="$(read_required "请输入 ${MASTER_HOST} 的公网 IP(用于最后生成 Web UI 地址)")"
WORKER_HOSTS=()
WORKER_IPS=()
local i default_host worker_host worker_ip
for ((i = 1; i <= worker_count; i++)); do
default_host="slave${i}"
worker_host="$(read_with_default "请输入第 ${i} 台 worker 主机名" "$default_host")"
worker_ip="$(read_required "请输入 ${worker_host} 的阿里云内网 IP")"
WORKER_HOSTS+=("$worker_host")
WORKER_IPS+=("$worker_ip")
done
if (( worker_count >= 2 )); then
default_secondary_index=2
else
default_secondary_index=1
fi
while true; do
local index
index="$(read_with_default "请选择运行 SecondaryNameNode 的 worker 序号(1-${worker_count})" "$default_secondary_index")"
if [[ "$index" =~ ^[0-9]+$ ]] && (( index >= 1 && index <= worker_count )); then
SECONDARY_HOST="${WORKER_HOSTS[$((index - 1))]}"
break
fi
echo "请输入 1 到 ${worker_count} 之间的数字。"
done
echo "提示:这里直接输入真实密码即可,不要额外输入单引号或双引号;脚本会安全地处理密码中的 !、@、# 等特殊字符。"
ROOT_PASSWORD="$(read_secret_required "请输入所有 ECS 统一 root 密码(仅用于分发 SSH 公钥,不写入配置文件)")"
ROOT_PASSWORD="$(strip_matching_outer_quotes "$ROOT_PASSWORD")"
JAVA_URL="$(read_with_default "请输入 Java 下载 URL" "$JAVA_DEFAULT_URL")"
choose_hadoop_url
if (( worker_count >= 3 )); then
REPLICATION=3
else
REPLICATION="$worker_count"
fi
echo
echo "集群规划确认:"
echo " master: ${MASTER_HOST} (${MASTER_PRIVATE_IP})"
for ((i = 0; i < worker_count; i++)); do
echo " worker: ${WORKER_HOSTS[$i]} (${WORKER_IPS[$i]})"
done
echo " SecondaryNameNode: ${SECONDARY_HOST}"
echo " HDFS 副本数 dfs.replication: ${REPLICATION}"
explain "副本数自动取 min(3, worker数量)。这样 4 台实验集群会使用 3 副本;如果只有 1 或 2 台 worker,也能正常启动,不会强行要求不存在的数据节点。"
}
save_config() {
umask 077
{
printf 'MASTER_HOST=%q\n' "$MASTER_HOST"
printf 'MASTER_PRIVATE_IP=%q\n' "$MASTER_PRIVATE_IP"
printf 'MASTER_PUBLIC_IP=%q\n' "$MASTER_PUBLIC_IP"
printf 'JAVA_URL=%q\n' "$JAVA_URL"
printf 'HADOOP_URL=%q\n' "$HADOOP_URL"
printf 'SECONDARY_HOST=%q\n' "$SECONDARY_HOST"
printf 'REPLICATION=%q\n' "$REPLICATION"
printf 'WORKER_HOSTS=('
local item
for item in "${WORKER_HOSTS[@]}"; do
printf '%q ' "$item"
done
printf ')\n'
printf 'WORKER_IPS=('
for item in "${WORKER_IPS[@]}"; do
printf '%q ' "$item"
done
printf ')\n'
} > "$CONFIG_FILE"
say "已保存集群配置到 ${CONFIG_FILE}。"
}
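# Illustrative note: with a hypothetical two-worker plan the saved file
# (/etc/hadoop-teaching-deploy.conf) would look roughly like:
#   MASTER_HOST=master
#   MASTER_PRIVATE_IP=172.16.0.10
#   ...
#   WORKER_HOSTS=(slave1 slave2 )
#   WORKER_IPS=(172.16.0.11 172.16.0.12 )
# printf %q keeps the values safe to source back in load_config below.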
load_config() {
if [[ ! -f "$CONFIG_FILE" ]]; then
return 1
fi
# shellcheck source=/dev/null
source "$CONFIG_FILE"
return 0
}
ensure_config_loaded_or_collect() {
if load_config; then
say "已读取现有集群配置:${CONFIG_FILE}"
else
collect_cluster_info
save_config
fi
}
all_hosts() {
printf '%s\n' "$MASTER_HOST"
printf '%s\n' "${WORKER_HOSTS[@]}"
}
worker_count() {
echo "${#WORKER_HOSTS[@]}"
}
check_local_environment() {
section "检查 master 本机基础环境"
explain "Hadoop 分布式部署依赖 Linux 命令、SSH、网络和磁盘空间。先检查这些前提,可以把错误尽早暴露出来。"
require_root
[[ -r /etc/centos-release ]] || die "未找到 /etc/centos-release。本脚本限定 CentOS 7.9 教学环境。"
local release
release="$(cat /etc/centos-release)"
echo "当前系统:${release}"
if [[ "$release" != *"7.9"* ]]; then
warn "脚本按 CentOS 7.9 编写,当前系统不是 7.9。继续执行可能需要手动排错。"
fi
command -v yum >/dev/null 2>&1 || die "未找到 yum。"
command -v systemctl >/dev/null 2>&1 || die "未找到 systemctl。"
run_cmd "查看 /opt 可用空间,确认有足够空间存放 Java、Hadoop 和 HDFS 数据。" "df -h /opt || df -h /"
run_cmd "测试 DNS 能否解析外网下载域名。如果这里失败,请检查 ECS 网络或 DNS。" "getent hosts archive.apache.org >/dev/null"
}
install_base_tools() {
section "安装部署所需基础工具"
explain "curl/wget 用于下载安装包,openssh-clients/sshpass 用于分发 SSH 公钥,rsync 用于同步软件目录和配置文件。"
run_cmd "安装 epel-release。sshpass 通常来自 EPEL 仓库。" "yum install -y epel-release"
run_cmd "安装部署工具。" "yum install -y curl wget tar openssh-clients rsync sshpass"
run_cmd "测试能否通过 HTTPS 访问外网下载源。如果这里失败,请检查 ECS 网络、安全组出站规则或 DNS。" "curl -I --connect-timeout 8 https://archive.apache.org >/dev/null"
}
disable_firewalld() {
section "处理服务器本机防火墙"
explain "课堂实验中,为了减少端口排错,本脚本关闭 firewalld。阿里云安全组仍然是公网访问 Web UI 的关键,需要在控制台开放对应端口。"
if systemctl list-unit-files | grep -q '^firewalld\.service'; then
run_cmd "停止 firewalld。" "systemctl stop firewalld || true"
run_cmd "禁止 firewalld 开机自启。" "systemctl disable firewalld || true"
else
say "未检测到 firewalld 服务,跳过。"
fi
warn "请在阿里云安全组开放 22、9870、8020、9864、9867、9866、9868、8088、19888、10020 等端口。"
}
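# Illustrative note: once the security group ports are open, a quick check from
# your own machine (203.0.113.10 is an example public IP, replace it) is:
#   curl -I --connect-timeout 5 http://203.0.113.10:9870/
# Any HTTP response means the NameNode web UI is reachable; a timeout usually
# points at the security group rather than at Hadoop itself.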
verify_password_ssh_access() {
section "预检查 worker 的 SSH 密码登录"
explain "在修改 worker 主机名之前,先确认 master 能用统一 root 密码登录每台 worker。这样如果密码、root 登录策略或安全组有问题,会在真正修改系统配置前停下来。"
local i ip host
for ((i = 0; i < $(worker_count); i++)); do
ip="${WORKER_IPS[$i]}"
host="${WORKER_HOSTS[$i]}"
echo "+ SSHPASS=****** sshpass -e ssh root@${ip} \"echo password ssh to ${host} succeeded\""
if SSHPASS="$ROOT_PASSWORD" sshpass -e ssh $SSH_OPTS "root@${ip}" "echo password ssh to ${host} succeeded"; then
say "${host} (${ip}) 密码 SSH 登录成功。"
else
echo
echo "无法使用 root 密码登录 ${host} (${ip})。请优先检查:"
echo "1. 输入的统一 root 密码是否正确。"
echo "2. 目标 ECS 的 /etc/ssh/sshd_config 是否允许 PasswordAuthentication yes。"
echo "3. 目标 ECS 是否允许 root 登录:PermitRootLogin yes 或 prohibit-password 以外的可用策略。"
echo "4. 阿里云安全组是否允许 master 访问 worker 的 22 端口。"
echo
echo "如果你要手动测试包含 ! 的密码,不要直接写在交互式 Bash 命令里。推荐这样:"
echo " read -r -s SSHPASS"
echo " export SSHPASS"
echo " sshpass -e ssh root@${ip} 'hostname'"
echo
echo "或者先关闭历史展开:"
echo " set +H"
echo " sshpass -p '你的密码' ssh root@${ip} 'hostname'"
die "SSH 密码预检查失败,已停止部署。"
fi
done
}
configure_hostnames() {
section "设置主机名"
explain "主机名让每台服务器有稳定身份。Hadoop 配置中使用 master、slave1 这类名称,比直接写 IP 更易读,也更符合实验报告。"
run_cmd "设置 master 节点主机名。" "hostnamectl set-hostname $(printf '%q' "$MASTER_HOST")"
local i host
for ((i = 0; i < $(worker_count); i++)); do
host="${WORKER_HOSTS[$i]}"
remote_cmd_password "${WORKER_IPS[$i]}" "设置 worker 主机名为 ${host}。这里先用 IP 连接,因为 /etc/hosts 还没有配置完成。" "hostnamectl set-hostname $(printf '%q' "$host")"
done
}
hosts_block() {
echo "$HOSTS_BEGIN"
echo "${MASTER_PRIVATE_IP} ${MASTER_HOST}"
local i
for ((i = 0; i < $(worker_count); i++)); do
echo "${WORKER_IPS[$i]} ${WORKER_HOSTS[$i]}"
done
echo "$HOSTS_END"
}
write_hosts_file() {
local tmp_hosts="/tmp/hadoop-teaching-hosts.$$"
local tmp_block="/tmp/hadoop-teaching-hosts-block.$$"
hosts_block > "$tmp_block"
awk -v begin="$HOSTS_BEGIN" -v end="$HOSTS_END" '
$0 == begin {skip=1; next}
$0 == end {skip=0; next}
skip == 0 {print}
' /etc/hosts > "$tmp_hosts"
cat "$tmp_block" >> "$tmp_hosts"
cp "$tmp_hosts" /etc/hosts
rm -f "$tmp_hosts" "$tmp_block"
}
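# Illustrative note: with a hypothetical master plus two workers, the managed
# block appended to /etc/hosts would look like:
#   # BEGIN HADOOP TEACHING CLUSTER
#   172.16.0.10 master
#   172.16.0.11 slave1
#   172.16.0.12 slave2
#   # END HADOOP TEACHING CLUSTER
# The awk filter above removes any previous block first, so re-runs do not
# accumulate duplicate entries.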
configure_hosts_files() {
section "配置 /etc/hosts"
explain "这里把所有节点的主机名映射到阿里云内网 IP。后续 Hadoop 节点之间会通过内网通信,速度更快,也不消耗公网流量。"
write_hosts_file
echo "当前 master 的 Hadoop hosts 配置块:"
sed -n "/${HOSTS_BEGIN}/,/${HOSTS_END}/p" /etc/hosts
local i host ip
for ((i = 0; i < $(worker_count); i++)); do
host="${WORKER_HOSTS[$i]}"
ip="${WORKER_IPS[$i]}"
explain "同步 /etc/hosts 到 ${host}。这里继续使用 IP,避免 hosts 文件尚未生效时解析失败。"
echo "+ SSHPASS=****** sshpass -e scp /etc/hosts root@${ip}:/etc/hosts"
SSHPASS="$ROOT_PASSWORD" sshpass -e scp $SSH_OPTS /etc/hosts "root@${ip}:/etc/hosts"
done
run_cmd "在 master 上验证主机名解析。" "for h in $(printf '%q ' "$MASTER_HOST" "${WORKER_HOSTS[@]}"); do getent hosts \"\$h\"; done"
}
setup_ssh_key() {
section "配置 SSH 免密登录"
explain "Hadoop 的 start-dfs.sh 和 start-yarn.sh 会从 master 通过 SSH 到各 worker 启动服务,所以 master 必须能免密登录所有节点。"
mkdir -p /root/.ssh
chmod 700 /root/.ssh
if [[ ! -f /root/.ssh/id_rsa ]]; then
run_cmd "生成 master 节点 SSH 密钥。-N '' 表示不设置私钥口令,便于 Hadoop 自动启动远程服务。" "ssh-keygen -t rsa -b 4096 -N '' -f /root/.ssh/id_rsa"
else
say "检测到 /root/.ssh/id_rsa 已存在,跳过生成密钥。"
fi
local target
for target in "$MASTER_HOST" "${WORKER_HOSTS[@]}"; do
if [[ "$target" == "$MASTER_HOST" ]]; then
explain "把 master 的公钥加入本机 authorized_keys,保证 master 也能免密 SSH 到自己。"
echo "+ install local public key into /root/.ssh/authorized_keys"
touch /root/.ssh/authorized_keys
chmod 600 /root/.ssh/authorized_keys
grep -qxF "$(cat /root/.ssh/id_rsa.pub)" /root/.ssh/authorized_keys || cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys
else
explain "把 master 的公钥分发给 ${target},后续即可免密登录。"
echo "+ SSHPASS=****** sshpass -e ssh-copy-id root@${target}"
SSHPASS="$ROOT_PASSWORD" sshpass -e ssh-copy-id $SSH_OPTS "root@${target}" >/dev/null
fi
done
section "验证 SSH 免密登录"
for target in "$MASTER_HOST" "${WORKER_HOSTS[@]}"; do
remote_cmd "$target" "验证 master 到 ${target} 是否无需密码即可执行命令。" "echo SSH to ${target} succeeded"
done
}
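# Illustrative note: to re-check key-based login by hand later, BatchMode makes
# ssh fail immediately instead of falling back to a password prompt, e.g.
#   ssh -o BatchMode=yes root@slave1 hostname
# ("slave1" is an example; use any worker from your own plan).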
stop_hadoop_services_best_effort() {
section "停止已有 Hadoop 服务"
explain "全新覆盖部署前先尝试停止旧服务,避免清理目录时仍有进程占用文件。即使当前没有安装 Hadoop,这一步失败也不会中断。"
if [[ -x "${HADOOP_HOME_DIR}/sbin/stop-yarn.sh" || -x "${HADOOP_HOME_DIR}/sbin/stop-dfs.sh" ]]; then
bash -lc "source ${PROFILE_FILE} 2>/dev/null || true; mapred --daemon stop historyserver 2>/dev/null || true; stop-yarn.sh 2>/dev/null || true; stop-dfs.sh 2>/dev/null || true" || true
else
say "未检测到可用的 Hadoop 停止脚本,跳过。"
fi
}
cleanup_old_installation() {
section "清理旧安装和旧 HDFS 数据"
explain "这是教学环境的全新覆盖安装策略。会删除 /opt/java8、/opt/hadoop 以及旧安装包缓存;NameNode 和 DataNode 的旧数据也会被清理。"
local cleanup_cmd
cleanup_cmd="rm -rf ${JAVA_HOME_DIR} ${HADOOP_HOME_DIR} ${SOFTWARE_DIR:?}/${JAVA_ARCHIVE} ${SOFTWARE_DIR:?}/${HADOOP_ARCHIVE} ${PROFILE_FILE}"
run_cmd "清理 master 上的旧目录和安装包。" "mkdir -p ${SOFTWARE_DIR}; ${cleanup_cmd}"
local host
for host in "${WORKER_HOSTS[@]}"; do
remote_cmd "$host" "清理 ${host} 上的旧目录和安装包。" "mkdir -p ${SOFTWARE_DIR}; ${cleanup_cmd}"
done
}
download_with_resume() {
local url="$1"
local output="$2"
local description="$3"
explain "$description"
echo "+ curl -L --fail --retry 3 -C - -o ${output} ${url}"
curl -L --fail --retry 3 -C - -o "$output" "$url"
}
install_java_and_hadoop() {
section "下载并安装 Java 与 Hadoop"
explain "Hadoop 使用 Java 编写,所以必须先安装 JDK。这里沿用实验报告中的 OpenJDK 8 与 Hadoop 3.3.6。"
mkdir -p "$SOFTWARE_DIR"
download_with_resume "$JAVA_URL" "${SOFTWARE_DIR}/${JAVA_ARCHIVE}" "下载 OpenJDK 8 安装包。"
download_with_resume "$HADOOP_URL" "${SOFTWARE_DIR}/${HADOOP_ARCHIVE}" "下载 Hadoop ${HADOOP_VERSION} 安装包。"
run_cmd "解压 OpenJDK 到 /opt,并重命名为 /opt/java8。" "tar -zxf ${SOFTWARE_DIR}/${JAVA_ARCHIVE} -C /opt && mv /opt/${JAVA_DIR_NAME} ${JAVA_HOME_DIR}"
run_cmd "解压 Hadoop 到 /opt,并重命名为 /opt/hadoop。" "tar -zxf ${SOFTWARE_DIR}/${HADOOP_ARCHIVE} -C /opt && mv /opt/hadoop-${HADOOP_VERSION} ${HADOOP_HOME_DIR}"
}
write_profile() {
section "配置环境变量"
explain "把 Java 和 Hadoop 命令加入 PATH 后,学生可以在任意目录执行 java、hadoop、hdfs、start-dfs.sh 等命令。这里写入 /etc/profile.d 独立文件,避免修改整份 /etc/profile。"
cat > "$PROFILE_FILE" <<EOF_PROFILE
export JAVA_HOME=${JAVA_HOME_DIR}
export HADOOP_HOME=${HADOOP_HOME_DIR}
export PATH=\$PATH:\$JAVA_HOME/bin:\$HADOOP_HOME/bin:\$HADOOP_HOME/sbin
EOF_PROFILE
chmod 0644 "$PROFILE_FILE"
run_cmd "验证 Java 版本。" "source ${PROFILE_FILE}; java -version"
run_cmd "验证 Hadoop 版本。" "source ${PROFILE_FILE}; hadoop version"
}
write_xml_config() {
local file="$1"
shift
{
echo '<?xml version="1.0" encoding="UTF-8"?>'
echo '<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>'
echo '<configuration>'
while (( "$#" )); do
local name="$1"
local value="$2"
local comment="$3"
shift 3
echo " <!-- ${comment} -->"
echo " <property>"
echo " <name>${name}</name>"
echo " <value>${value}</value>"
echo " </property>"
done
echo '</configuration>'
} > "$file"
}
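# Illustrative note: write_xml_config turns each (name, value, comment) triple
# into one <property> block. The core-site.xml generated below would start
# roughly as (with "master" being whatever hostname was entered during planning):
#   <?xml version="1.0" encoding="UTF-8"?>
#   <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
#   <configuration>
#    <!-- Address and port of the HDFS master (NameNode). -->
#    <property>
#    <name>fs.defaultFS</name>
#    <value>hdfs://master:8020</value>
#    </property>
#    ...
#   </configuration>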
configure_hadoop_files() {
section "生成 Hadoop 分布式配置文件"
explain "这些 XML 文件决定 HDFS、YARN、MapReduce 的角色位置和运行参数。脚本根据前面输入的主机名自动生成配置。"
local conf_dir="${HADOOP_HOME_DIR}/etc/hadoop"
write_xml_config "${conf_dir}/core-site.xml" \
"fs.defaultFS" "hdfs://${MASTER_HOST}:8020" "指定 HDFS 文件系统主节点地址和端口。" \
"hadoop.tmp.dir" "${HADOOP_HOME_DIR}/hdfs/tmp" "指定 Hadoop 临时文件存储目录。" \
"hadoop.proxyuser.root.hosts" "*" "允许 root 用户通过代理访问任意主机,便于教学实验。" \
"hadoop.proxyuser.root.groups" "*" "允许 root 用户通过代理访问任意用户组,便于教学实验。"
write_xml_config "${conf_dir}/hdfs-site.xml" \
"dfs.replication" "$REPLICATION" "设置 HDFS 数据块副本数。" \
"dfs.namenode.name.dir" "${HADOOP_HOME_DIR}/hdfs/name" "指定 NameNode 元数据存储目录。" \
"dfs.datanode.data.dir" "${HADOOP_HOME_DIR}/hdfs/data" "指定 DataNode 数据块存储目录。" \
"dfs.namenode.http-address" "${MASTER_HOST}:9870" "设置 NameNode Web 管理界面地址。" \
"dfs.namenode.secondary.http-address" "${SECONDARY_HOST}:9868" "设置 SecondaryNameNode Web 管理界面地址。" \
"dfs.permissions" "false" "关闭 HDFS 权限检查,降低教学实验复杂度。"
write_xml_config "${conf_dir}/yarn-site.xml" \
"yarn.resourcemanager.hostname" "$MASTER_HOST" "指定 ResourceManager 所在节点。" \
"yarn.nodemanager.aux-services" "mapreduce_shuffle" "启用 MapReduce shuffle 服务。" \
"yarn.log.server.url" "http://${MASTER_HOST}:19888/jobhistory/logs" "设置历史日志服务器地址。" \
"yarn.log-aggregation-enable" "true" "开启 YARN 日志聚集。"
write_xml_config "${conf_dir}/mapred-site.xml" \
"mapreduce.framework.name" "yarn" "指定 MapReduce 程序运行在 YARN 上。" \
"mapreduce.jobhistory.address" "${MASTER_HOST}:10020" "设置 JobHistoryServer RPC 地址。" \
"mapreduce.jobhistory.webapp.address" "${MASTER_HOST}:19888" "设置 JobHistoryServer Web 地址。" \
"yarn.app.mapreduce.am.env" "HADOOP_MAPRED_HOME=\${HADOOP_HOME}" "设置 MapReduce ApplicationMaster 环境变量。" \
"mapreduce.map.env" "HADOOP_MAPRED_HOME=\${HADOOP_HOME}" "设置 Map 任务环境变量。" \
"mapreduce.reduce.env" "HADOOP_MAPRED_HOME=\${HADOOP_HOME}" "设置 Reduce 任务环境变量。"
{
echo "export JAVA_HOME=${JAVA_HOME_DIR}"
echo "export HADOOP_HOME=${HADOOP_HOME_DIR}"
echo "export HDFS_NAMENODE_USER=root"
echo "export HDFS_DATANODE_USER=root"
echo "export HDFS_SECONDARYNAMENODE_USER=root"
echo "export YARN_RESOURCEMANAGER_USER=root"
echo "export YARN_NODEMANAGER_USER=root"
} >> "${conf_dir}/hadoop-env.sh"
printf '%s\n' "${WORKER_HOSTS[@]}" > "${conf_dir}/workers"
echo "workers 文件内容:"
cat "${conf_dir}/workers"
run_cmd "在 master 创建 NameNode 元数据目录。" "mkdir -p ${HADOOP_HOME_DIR}/hdfs/name ${HADOOP_HOME_DIR}/hdfs/tmp"
}
create_worker_hdfs_dirs() {
section "创建 worker HDFS 数据目录"
explain "DataNode 会把 HDFS 数据块存放在各 worker 的数据目录中。软件同步完成后再创建目录,可以避免 rsync 覆盖安装时把空目录清掉。"
local host
for host in "${WORKER_HOSTS[@]}"; do
remote_cmd "$host" "创建 DataNode 数据目录。" "mkdir -p ${HADOOP_HOME_DIR}/hdfs/data ${HADOOP_HOME_DIR}/hdfs/tmp"
done
}
sync_installation_to_workers() {
section "同步 Java、Hadoop 和环境变量到 worker"
explain "为了保证集群环境一致,master 安装并配置完成后,把相同目录同步到所有 worker。"
local host
for host in "${WORKER_HOSTS[@]}"; do
copy_to_remote "${JAVA_HOME_DIR}/" "$host" "${JAVA_HOME_DIR}/" "同步 Java 到 ${host}。"
copy_to_remote "${HADOOP_HOME_DIR}/" "$host" "${HADOOP_HOME_DIR}/" "同步 Hadoop 到 ${host}。"
explain "同步环境变量文件到 ${host}。"
echo "+ scp ${PROFILE_FILE} root@${host}:${PROFILE_FILE}"
scp $SSH_OPTS "$PROFILE_FILE" "root@${host}:${PROFILE_FILE}"
remote_cmd "$host" "验证 Java 与 Hadoop 命令可用。" "source ${PROFILE_FILE}; java -version; hadoop version | head -n 1"
done
}
format_namenode() {
section "格式化 NameNode"
explain "格式化 NameNode 会初始化 HDFS 命名空间。全新部署时必须执行;重复格式化会清空 HDFS 元数据,所以本脚本只在全新覆盖流程中执行。"
run_cmd "执行 NameNode 格式化。" "source ${PROFILE_FILE}; hdfs namenode -format -force"
}
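# Illustrative note: if you ever re-format by hand after the cluster has run,
# also wipe the DataNode directories on every worker first, e.g.
#   rm -rf /opt/hadoop/hdfs/data /opt/hadoop/hdfs/tmp
# otherwise the DataNodes keep the old clusterID and refuse to register with
# the newly formatted NameNode. The clean-reinstall flow already handles this.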
start_cluster() {
ensure_config_loaded_or_collect
section "启动 Hadoop 分布式集群"
explain "启动顺序为 HDFS -> YARN -> JobHistoryServer。HDFS 提供分布式存储,YARN 提供资源调度,历史服务器记录 MapReduce 作业历史。"
run_cmd "启动 HDFS。" "source ${PROFILE_FILE}; start-dfs.sh"
run_cmd "启动 YARN。" "source ${PROFILE_FILE}; start-yarn.sh"
run_cmd "启动 MapReduce JobHistoryServer。" "source ${PROFILE_FILE}; mapred --daemon start historyserver"
check_status
print_web_urls
}
stop_cluster() {
ensure_config_loaded_or_collect
section "停止 Hadoop 分布式集群"
explain "停止顺序为 JobHistoryServer -> YARN -> HDFS。先停上层计算服务,再停底层存储服务,能减少异常关闭带来的干扰。"
run_cmd "停止 MapReduce JobHistoryServer。" "source ${PROFILE_FILE}; mapred --daemon stop historyserver || true"
run_cmd "停止 YARN。" "source ${PROFILE_FILE}; stop-yarn.sh || true"
run_cmd "停止 HDFS。" "source ${PROFILE_FILE}; stop-dfs.sh || true"
check_jps_all_nodes
}
check_jps_all_nodes() {
section "检查各节点 Java 进程"
explain "jps 会列出当前节点上的 Java 进程。NameNode、DataNode、ResourceManager、NodeManager 等都应该能在对应节点看到。"
run_cmd "查看 master 上的进程。" "source ${PROFILE_FILE} 2>/dev/null || true; jps || true"
local host
for host in "${WORKER_HOSTS[@]}"; do
remote_cmd "$host" "查看 ${host} 上的进程。" "source ${PROFILE_FILE} 2>/dev/null || true; jps || true"
done
}
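# Illustrative note: with this configuration a healthy cluster typically shows
# (process names as printed by jps):
#   master : NameNode, ResourceManager, JobHistoryServer
#   workers: DataNode, NodeManager
#   the worker chosen earlier additionally runs SecondaryNameNode
# Missing daemons are usually explained by the logs under /opt/hadoop/logs.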
check_status() {
ensure_config_loaded_or_collect
check_jps_all_nodes
section "检查 HDFS 和 YARN 节点状态"
explain "dfsadmin -report 用来查看 DataNode 是否在线;yarn node -list 用来查看 NodeManager 是否加入集群。"
run_cmd "查看 HDFS DataNode 报告。" "source ${PROFILE_FILE}; hdfs dfsadmin -report || true"
run_cmd "查看 YARN NodeManager 列表。" "source ${PROFILE_FILE}; yarn node -list || true"
}
print_web_urls() {
section "Web 管理界面地址"
explain "这些页面用于观察集群状态。若公网无法访问,请优先检查阿里云安全组端口是否开放。"
echo "HDFS NameNode: http://${MASTER_PUBLIC_IP}:9870/"
echo "YARN ResourceManager: http://${MASTER_PUBLIC_IP}:8088/"
echo "JobHistoryServer: http://${MASTER_PUBLIC_IP}:19888/"
}
full_deploy() {
collect_cluster_info
save_config
check_local_environment
install_base_tools
disable_firewalld
verify_password_ssh_access
configure_hostnames
configure_hosts_files
setup_ssh_key
stop_hadoop_services_best_effort
cleanup_old_installation
install_java_and_hadoop
write_profile
configure_hadoop_files
sync_installation_to_workers
create_worker_hdfs_dirs
format_namenode
start_cluster
}
clean_reinstall() {
section "清理重装"
explain "清理重装会重新收集集群信息、停止服务、删除旧目录,然后完整执行一次全新部署。"
full_deploy
}
show_menu() {
echo
line
echo "Hadoop 分布式集群自动部署教学脚本"
line
echo "1) 全新部署"
echo "2) 启动集群"
echo "3) 停止集群"
echo "4) 状态检查"
echo "5) 清理重装"
echo "0) 退出"
}
main() {
require_root
init_logging
say "日志文件:${log_file}"
warn "本脚本面向教学实验环境,会在全新部署/清理重装时删除旧 Hadoop、Java 和 HDFS 数据目录。"
while true; do
show_menu
local choice
read -r -p "请选择操作:" choice
case "$choice" in
1) full_deploy; pause_for_reading ;;
2) start_cluster; pause_for_reading ;;
3) stop_cluster; pause_for_reading ;;
4) check_status; print_web_urls; pause_for_reading ;;
5) clean_reinstall; pause_for_reading ;;
0) say "退出脚本。"; break ;;
*) echo "无效选项,请重新输入。" ;;
esac
done
}
main "$@"