Skip to content

Instantly share code, notes, and snippets.

@dgruber
Last active May 5, 2025 12:33
Show Gist options
  • Save dgruber/c880728f4002bfd6a0d360c7f6a27de1 to your computer and use it in GitHub Desktop.
Save dgruber/c880728f4002bfd6a0d360c7f6a27de1 to your computer and use it in GitHub Desktop.
#!/bin/sh
#
# Open Cluster Scheduler installation script (improved variant).
# Intended to work across Linux distributions.
#
set -o errexit # Abort as soon as a command fails
# set -o nounset # (disabled) would treat unset variables as errors
printf '%s\n' "Starting Open Cluster Scheduler installation..."
# Install screen through an RPM-based package manager, adding EPEL as a
# fallback source when the default repositories do not carry it.
# Arguments: $1 - package manager command (dnf or yum)
# Returns:   always 0; a missing screen package only produces a warning
_install_screen_rpm() {
  local pm="$1"
  if sudo "$pm" install -y screen; then
    return 0
  fi
  echo "Screen package not found in default repositories, adding EPEL..."
  if ! sudo "$pm" install -y epel-release; then
    echo "WARNING: Failed to install EPEL repository"
    return 0
  fi
  echo "EPEL repository installed successfully"
  if ! sudo "$pm" install -y screen; then
    echo "WARNING: Screen package not available even with EPEL. Continuing without screen..."
  fi
  return 0
}

# Function to install packages based on the package manager
#
# Probes for apt, dnf, yum, pacman and zypper (in that order) and installs the
# base tools plus the libtirpc runtime/development packages with the first
# manager found. Exits the script with status 1 when no supported package
# manager exists or, on zypper systems, when libtirpc-devel or git is missing
# after the installation.
#
# Only POSIX sh constructs are used (no `&>`, `[[ ]]` or `source`) because the
# script runs under #!/bin/sh, where `cmd &> /dev/null` would be parsed as a
# background job followed by a separate redirection.
install_packages() {
  # Space-separated on purpose: expanded unquoted so every name becomes its
  # own argument (POSIX sh has no arrays).
  local packages="git tar binutils sudo make wget bash"
  if command -v apt > /dev/null 2>&1; then
    echo "Detected apt package manager"
    sudo apt update
    sudo apt install -y $packages
    # On Ubuntu, the package names are libtirpc3 and libtirpc-dev
    echo "Installing libtirpc packages..."
    sudo apt install -y libtirpc3 libtirpc-dev
  elif command -v dnf > /dev/null 2>&1; then
    echo "Detected dnf package manager"
    # First install dnf-plugins-core if not already installed
    if ! rpm -q dnf-plugins-core > /dev/null 2>&1; then
      echo "Installing dnf-plugins-core for repository management..."
      sudo dnf install -y dnf-plugins-core
    fi
    # Try to install packages directly first
    sudo dnf install -y $packages
    echo "Enabling CRB/CodeReady repo for development packages..."
    # ID and VERSION_ID come from os-release; tolerate a missing file and
    # fall through to the generic repository names in that case.
    if [ -r /etc/os-release ]; then
      . /etc/os-release
    fi
    case "${ID:-}:${VERSION_ID:-}" in
      ol:9*)
        # Oracle Linux 9 uses its own repository id
        sudo dnf config-manager --set-enabled ol9_codeready_builder
        ;;
      *)
        # Best effort: the repository is called crb or powertools depending
        # on the distribution release; neither existing is not fatal.
        sudo dnf config-manager --set-enabled crb \
          || sudo dnf config-manager --set-enabled powertools \
          || true
        ;;
    esac
    # Install libtirpc and libtirpc-devel directly from the repository
    echo "Installing libtirpc from repository..."
    sudo dnf install -y libtirpc libtirpc-devel
    # Try to install screen, if it fails, add EPEL repo
    _install_screen_rpm dnf
  elif command -v yum > /dev/null 2>&1; then
    echo "Detected yum package manager"
    sudo yum install -y $packages
    # Enable optional repositories
    echo "Enabling required repositories..."
    sudo yum install -y yum-utils
    # Best effort, see the dnf branch
    sudo yum-config-manager --enable powertools \
      || sudo yum-config-manager --enable crb \
      || true
    # Install libtirpc directly
    sudo yum install -y libtirpc libtirpc-devel
    # Try to install screen, if it fails, add EPEL repo
    _install_screen_rpm yum
  elif command -v pacman > /dev/null 2>&1; then
    echo "Detected pacman package manager"
    sudo pacman -Sy --noconfirm $packages screen libtirpc
  elif command -v zypper > /dev/null 2>&1; then
    echo "Detected zypper package manager"
    # Detect distribution and version
    if [ -f /etc/os-release ]; then
      . /etc/os-release
      DISTID=$ID
      DISTVERSION=${VERSION_ID%%.*}
    else
      echo "ERROR: Cannot detect SUSE distribution version."
      exit 1
    fi
    # Default package list
    packages="git tar binutils sudo make wget bash screen libtirpc libtirpc-devel"
    if [ "$DISTID" = "sles" ]; then
      echo "Detected SUSE Linux Enterprise Server $DISTVERSION"
      # Register Desktop Applications module FIRST; retry with the plain
      # major version "15" when the exact VERSION_ID is rejected.
      sudo SUSEConnect -p "sle-module-desktop-applications/${VERSION_ID}/x86_64" \
        || sudo SUSEConnect -p sle-module-desktop-applications/15/x86_64
      # Register Development Tools module
      sudo SUSEConnect -p "sle-module-development-tools/${VERSION_ID}/x86_64" \
        || sudo SUSEConnect -p sle-module-development-tools/15/x86_64
      packages="git-core tar binutils sudo make wget bash screen libtirpc3 libtirpc-devel"
    elif [ "$DISTID" = "opensuse-leap" ]; then
      echo "Detected openSUSE Leap $DISTVERSION"
      # On openSUSE, package names are as expected
      packages="git tar binutils sudo make wget bash screen libtirpc3 libtirpc-devel"
    else
      echo "WARNING: Unknown SUSE variant; attempting with default package names."
    fi
    # Install packages with zypper
    sudo zypper install -y --no-recommends $packages
    # Check for errors if critical packages are missing
    if ! rpm -q libtirpc-devel > /dev/null; then
      echo "ERROR: libtirpc-devel could not be installed."
      exit 1
    fi
    if ! command -v git > /dev/null 2>&1; then
      echo "ERROR: git could not be installed."
      exit 1
    fi
  else
    echo "ERROR: Unsupported package manager. Please install the following packages manually:"
    echo "$packages screen libtirpc libtirpc-devel"
    exit 1
  fi
}
# Prepare the installation prefix (created as root via sudo).
setup_directories() {
  local install_root="/opt/ocs"
  printf '%s\n' "Setting up installation directories..."
  # The download directory is (re)created by download_files, so only the
  # installation root has to exist at this point.
  sudo mkdir -p "$install_root"
}
# Download installation files
#
# Recreates ./ocs_downloads, fetches the Open Cluster Scheduler packages with
# wget and unpacks every downloaded ocs-*.tar.gz archive into /opt/ocs/ (via
# sudo). The work happens in a subshell so the caller's working directory is
# untouched on every exit path.
# Returns: 0 on success; 1 when a download or extraction fails, or when no
#          archive matching ocs-*.tar.gz exists after the downloads.
download_files() {
  echo "Downloading Open Cluster Scheduler files..."
  # Use the local directory for downloads
  local download_dir="./ocs_downloads"
  # Clean up existing downloads
  echo "Cleaning up existing downloads..."
  rm -rf "$download_dir"
  mkdir -p "$download_dir" || return 1
  (
    cd "$download_dir" || exit 1
    # Open Cluster Scheduler 9.0.5, in this order:
    #   lx-amd64 bin package
    #   ulx-amd64 bin package (old Linux, like CentOS 7)
    #   doc package
    #   common / scripts package
    for url in \
      'https://www.hpc-gridware.com/download/10529/?tmstv=1745334305' \
      'https://www.hpc-gridware.com/download/10535/?tmstv=1745334305' \
      'https://www.hpc-gridware.com/download/10543/?tmstv=1745334305' \
      'https://www.hpc-gridware.com/download/10541/?tmstv=1745334305'; do
      # Errors are checked explicitly: errexit is not reliable inside a
      # subshell that is itself part of an `||` list.
      if ! wget -q --show-progress -k --content-disposition "$url"; then
        echo "ERROR: Failed to download $url" >&2
        exit 1
      fi
    done
    echo "Extracting files to installation directory..."
    extracted=0
    for file in ocs-*.tar.gz; do
      # An unmatched glob stays literal; skip anything that is not a file.
      [ -f "$file" ] || continue
      echo " Extracting $file..."
      sudo tar xpf "$file" -C /opt/ocs/ || exit 1
      extracted=$((extracted + 1))
    done
    # Without this check an empty download would only surface much later as
    # an obscure installer failure.
    if [ "$extracted" -eq 0 ]; then
      echo "ERROR: No ocs-*.tar.gz archive found after download." >&2
      exit 1
    fi
  ) || return 1
}
# Create autoinstall template
#
# Writes autoinstall.template into the current working directory. The file is
# a list of KEY="value" settings (installation root /opt/ocs, ports 6444/6445,
# classic spooling, ...) with the local host name, as printed by `hostname`,
# used for the admin, submit and execution host lists.
# Returns: non-zero when the host name cannot be determined or the file
#          cannot be written.
create_autoinstall_template() {
  # Declaration and assignment are separate so a failing `hostname` is not
  # hidden by the exit status of `local`.
  local hostname template_file
  hostname=$(hostname) || return 1
  template_file="$(pwd)/autoinstall.template"
  # Unquoted delimiter: $hostname is expanded while the file is written.
  cat > "$template_file" << EOF
SGE_ROOT="/opt/ocs"
SGE_QMASTER_PORT="6444"
SGE_EXECD_PORT="6445"
SGE_ENABLE_SMF="false"
SGE_CLUSTER_NAME="p6444"
CELL_NAME="default"
ADMIN_USER="root"
QMASTER_SPOOL_DIR="/opt/ocs/default/spool/master"
EXECD_SPOOL_DIR="/opt/ocs/default/spool/execd"
GID_RANGE="20000-20200"
SPOOLING_METHOD="classic"
DB_SPOOLING_DIR="/opt/ocs/default/spool/bdb"
PAR_EXECD_INST_COUNT="20"
ADMIN_HOST_LIST="$hostname"
SUBMIT_HOST_LIST="$hostname"
EXEC_HOST_LIST="$hostname"
EXECD_SPOOL_DIR_LOCAL=""
HOSTNAME_RESOLVING="true"
SHELL_NAME="ssh"
COPY_COMMAND="scp"
DEFAULT_DOMAIN="none"
ADMIN_MAIL="none"
ADD_TO_RC="true"
SET_FILE_PERMS="true"
RESCHEDULE_JOBS="wait"
SCHEDD_CONF="3"
SHADOW_HOST=""
EXEC_HOST_LIST_RM=""
REMOVE_RC="false"
CSP_RECREATE="true"
CSP_COPY_CERTS="false"
CSP_COUNTRY_CODE="DE"
CSP_STATE="Germany"
CSP_LOCATION="Building"
CSP_ORGA="Organisation"
CSP_ORGA_UNIT="Organisation_unit"
CSP_MAIL_ADDRESS="name@yourdomain.com"
EOF
}
# Install Open Cluster Scheduler
#
# Runs the unattended qmaster + execd installation in /opt/ocs using the
# autoinstall.template found in the current working directory, then applies
# the post-install configuration as root and hooks settings.sh into the
# .bashrc of root and of the invoking user.
#
# If ${MOUNT_DIR}/default/common already exists the installation is skipped
# and only the daemon start scripts are run.
#
# Globals:  exports MOUNT_DIR and LD_LIBRARY_PATH; sources settings.sh into
#           the current shell after a successful installation
# Exits:    status 1 when settings.sh is missing after the installer ran or
#           when the post-install configuration fails
install_ocs() {
  echo "Installing Open Cluster Scheduler..."
  export MOUNT_DIR="/opt/ocs"
  export LD_LIBRARY_PATH=""
  # Declarations are split from the command substitutions so a failing
  # command is not masked by the exit status of `local`.
  local template_file tmp_template_host current_user tmp_config_script hostname
  template_file="$(pwd)/autoinstall.template"
  tmp_template_host="$(pwd)/template_host"
  current_user=$(whoami)
  # Check if already installed
  if [ -d "${MOUNT_DIR}/default/common" ]; then
    echo "Open Cluster Scheduler seems to be already installed!"
    echo "Starting Open Cluster Scheduler daemons."
    # NOTE(review): started without sudo, unlike every other privileged step
    # in this script - confirm this works when run as a non-root user.
    "${MOUNT_DIR}/default/common/sgemaster"
    "${MOUNT_DIR}/default/common/sgeexecd"
    return 0
  fi
  echo "Open Cluster Scheduler is not yet installed in ${MOUNT_DIR}. Starting installation."
  # Copy autoinstall template
  sudo cp "$template_file" "${MOUNT_DIR}/"
  # Fix filestat issue with Linux namespaces: the filestat helper is replaced
  # by a stub shell script that always prints "root".
  cd "${MOUNT_DIR}"
  sudo rm -f ./utilbin/lx-amd64/filestat
  sudo sh -c 'echo "#!/bin/sh" > ./utilbin/lx-amd64/filestat'
  sudo sh -c 'echo "echo root" >> ./utilbin/lx-amd64/filestat'
  sudo chmod +x ./utilbin/lx-amd64/filestat
  # Install qmaster and execd
  hostname=$(hostname)
  # Create template_host in the original directory first, then copy to the
  # installation dir. The sed replaces the literal word "docker" with the host
  # name; presumably a leftover placeholder from a container variant of this
  # script - verify whether the template can still contain it.
  sed "s:docker:${hostname}:g" "$template_file" > "$tmp_template_host"
  sudo cp "$tmp_template_host" "${MOUNT_DIR}/template_host"
  # Run the installation
  cd "${MOUNT_DIR}"
  # On more recent distros the rc directory is missing. Installing rc scripts, switching
  # to systemd later.
  # Rocky 9
  sudo mkdir -p /etc/rc.d/rc3.d/
  # openSUSE Leap 15.6
  sudo mkdir -p /etc/rc.d/init.d/
  sudo ./inst_sge -m -x -auto ./template_host
  # Configure environment
  if [ -f "${MOUNT_DIR}/default/common/settings.sh" ]; then
    # Use . instead of source for POSIX compatibility
    . "${MOUNT_DIR}/default/common/settings.sh"
    # Create a temporary shell script to run with sudo. mktemp gives it an
    # unpredictable, private name: the script is executed as root, so a
    # guessable path in /tmp must not be used.
    tmp_config_script=$(mktemp "${TMPDIR:-/tmp}/ocs_config.XXXXXX")
    # Unquoted delimiter: ${MOUNT_DIR} and ${current_user} are expanded now,
    # while the script is generated; \$ keeps a literal dollar sign.
    cat > "$tmp_config_script" << EOL
#!/bin/sh
# Source the settings file to set up the environment
. ${MOUNT_DIR}/default/common/settings.sh
# Enable root to submit jobs: set min_uid and min_gid to 0, leaving every
# other setting of the global configuration untouched
qconf -sconf | sed -e 's/^\(min_[ug]id[[:space:]]\{1,\}\)[0-9]\{1,\}[[:space:]]*\$/\10/' > ${MOUNT_DIR}/global
qconf -Mconf ${MOUNT_DIR}/global
# Allow 10 single-core jobs to be processed at once per node
qconf -rattr queue slots 10 all.q
# Make current user a manager
echo "Adding current user (${current_user}) as a manager..."
qconf -am "${current_user}"
# Add settings to root's bashrc
if ! grep -qF -- "${MOUNT_DIR}/default/common/settings.sh" /root/.bashrc 2>/dev/null; then
echo ". ${MOUNT_DIR}/default/common/settings.sh" >> /root/.bashrc
fi
EOL
    # Run the configuration script with sudo. It is passed to sh explicitly,
    # so it needs no execute bit.
    echo "Running OCS configuration..."
    if ! sudo sh "$tmp_config_script"; then
      rm -f "$tmp_config_script"
      echo "ERROR: OCS configuration failed."
      exit 1
    fi
    # Add settings to current user's bashrc if not already there (a missing
    # .bashrc counts as "not there" and is created by the appends below)
    if ! grep -qF -- "${MOUNT_DIR}/default/common/settings.sh" "$HOME/.bashrc" 2> /dev/null; then
      echo "" >> "$HOME/.bashrc"
      echo "# Open Cluster Scheduler settings" >> "$HOME/.bashrc"
      echo ". ${MOUNT_DIR}/default/common/settings.sh" >> "$HOME/.bashrc"
    fi
    # Clean up temporary script
    rm -f "$tmp_config_script"
  else
    echo "ERROR: Installation failed. Could not find settings.sh"
    exit 1
  fi
  # Clean up temporary files
  rm -f "$tmp_template_host"
  echo "Open Cluster Scheduler installation completed successfully!"
  echo "Current user ($current_user) has been added as a manager"
  echo "Open Cluster Scheduler environment has been added to your ~/.bashrc"
  echo "Please run: source ~/.bashrc or start a new terminal to use Open Cluster Scheduler commands"
}
# Entry point: run every installation stage once, in dependency order
# (packages, directories, download, template, installation).
main() {
  local stage
  for stage in \
    install_packages \
    setup_directories \
    download_files \
    create_autoinstall_template \
    install_ocs; do
    "$stage"
  done
}
# Run the script
# Executes all installation steps through main. Exit-on-error mode is enabled
# at the top of the script, so the explicit `exit 0` below is only reached
# when main returned successfully.
main
exit 0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment