Skip to content

Instantly share code, notes, and snippets.

@wileyj
Last active February 13, 2025 15:39
Show Gist options
  • Save wileyj/e84eb1e9bb0784f941483c50570587f0 to your computer and use it in GitHub Desktop.
Save wileyj/e84eb1e9bb0784f941483c50570587f0 to your computer and use it in GitHub Desktop.
#!/bin/bash
set -o pipefail
## Using 10 cpu cores, a full replay will take between 12-14 hours (assuming there are no other cpu/io bound processes running at the same time)
##
## ** Recommend to run this script in screen or tmux **
##
## We'll need ~73GB per slice, plus an extra ~400GB for the chainstate archive and marf DB
## as of 02/2025:
## for 10 slices, this is about 1.1TB
## - 149GB for compressed chainstate
## - 232GB decompressed marf db
## - 73GB per slice dir (1 dir per cpu)
## for 15 slices, this is about 1.46TB
## for 20 slices, this is about 1.8TB
NETWORK="mainnet" ## network to replay
REPO_DIR="$HOME/stacks-inspect" ## where to build the source
REMOTE_REPO="stacks-network/stacks-core" ## remote git repo to build stacks-inspect from
SCRATCH_DIR="$HOME/scratch" ## root folder for the replay slices
TIMESTAMP=$(date +%Y-%m-%d-%s) ## use a simple date format year-month-day-epoch
LOG_DIR="/tmp/replay_${TIMESTAMP}" ## location of logfiles for the replay
SLICE_DIR="${SCRATCH_DIR}/slice" ## location of slice dirs
TMUX_SESSION="replay" ## tmux session name to run the replay
TERM_OUT=false ## terminal friendly output
TESTING=false ## only run a replay on a few thousand blocks
UPLOAD=false ## default to not upload to s3
BRANCH="develop" ## default branch to build stacks-inspect from
S3_BUCKET="xxxxxxxxxxxxx" ## public s3 bucket to upload results to
S3_ROOT_FOLDER="results" ## s3 root folder
CORES=$(grep -c processor /proc/cpuinfo) ## retrieve total number of CORES on the system
RESERVED=10 ## reserve this many CORES for other processes as default
## ansi color codes for terminal output
COLRED=$'\033[31m' ## Red
COLGREEN=$'\033[32m' ## Green
COLYELLOW=$'\033[33m' ## Yellow
COLCYAN=$'\033[36m' ## Cyan
COLBOLD=$'\033[1m' ## Bold Text
COLRESET=$'\033[0m' ## reset color/formatting
## verify that cargo is installed in the expected path, not only $PATH
install_cargo() {
command -v "$HOME/.cargo/bin/cargo" >/dev/null 2>&1 || {
echo "Installing Rust via rustup"
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y || {
echo "${COLRED}Error${COLRESET} installing Rust"
exit 1
}
}
echo "Exporting $HOME/.cargo/env"
# shellcheck source=/dev/null
source "$HOME/.cargo/env"
return 0
}
## build stacks-inspect binary from specified repo/branch
build_stacks_inspect() {
if [ -d "${REPO_DIR}" ];then
echo "Found ${COLYELLOW}${REPO_DIR}${COLRESET}. checking out ${COLGREEN}${BRANCH}${COLRESET} and resetting to ${COLBOLD}HEAD${COLRESET}"
cd "${REPO_DIR}" && git fetch
echo "Checking out ${BRANCH} and resetting to HEAD"
git stash ## stash any local changes to prevent checking out $BRANCH
(git checkout "${BRANCH}" && git reset --hard HEAD) || {
echo "${COLRED}Error${COLRESET} checking out ${BRANCH}"
exit 1
}
else
echo "Cloning stacks-core ${BRANCH}"
(git clone "https://github.com/${REMOTE_REPO}" --branch "${BRANCH}" "${REPO_DIR}" && cd "${REPO_DIR}") || {
echo "${COLRED}Error${COLRESET} cloning https://github.com/${REMOTE_REPO} into ${REPO_DIR}"
exit 1
}
fi
git pull
## build stacks-inspect to: $HOME/stacks-inspect/target/release/stacks-inspect
echo "Building stacks-inspect binary"
cargo build --bin=stacks-inspect --release || {
echo "${COLRED}Error${COLRESET} building stacks-inspect binary"
exit 1
}
echo "Done building. continuing"
}
## create the slice dirs from an chainstate archive (symlinking marf.sqlite.blobs), 1 dir per CPU
configure_replay_slices() {
if [ -d "$HOME/scratch" ]; then
echo "Deleting existing scratch dir: ${COLYELLOW}$HOME/scratch${COLRESET}"
rm -rf "${HOME}/scratch" || {
echo "${COLRED}Error${COLRESET} deleting dir $HOME/scratch"
exit 1
}
fi
echo "Creating scratch and slice dirs"
(mkdir -p "${SLICE_DIR}0" && cd "${SCRATCH_DIR}") || {
echo "${COLRED}Error${COLRESET} creating dir ${SLICE_DIR}"
exit 1
}
echo "Downloading latest ${NETWORK} chainstate archive ${COLYELLOW}https://archive.hiro.so/${NETWORK}/stacks-blockchain/${NETWORK}-stacks-blockchain-latest.tar.gz${COLRESET}"
## curl had some random issues retrying the download when network issues arose. wget has resumed more consistently, so we'll use that binary
# curl -L --proto '=https' --tlsv1.2 https://archive.hiro.so/${NETWORK}/stacks-blockchain/${NETWORK}-stacks-blockchain-latest.tar.gz -o ${SCRATCH_DIR}/${NETWORK}-stacks-blockchain-latest.tar.gz || {
wget -O "${SCRATCH_DIR}/${NETWORK}-stacks-blockchain-latest.tar.gz" "https://archive.hiro.so/${NETWORK}/stacks-blockchain/${NETWORK}-stacks-blockchain-latest.tar.gz" || {
echo "${COLRED}Error${COLRESET} downlaoding latest ${NETWORK} chainstate archive"
exit 1
}
## extract downloaded archive
echo "Extracting downloaded archive: ${COLYELLOW}${SCRATCH_DIR}/${NETWORK}-stacks-blockchain-latest.tar.gz${COLRESET}"
tar --strip-components=1 -xzf "${SCRATCH_DIR}/${NETWORK}-stacks-blockchain-latest.tar.gz" -C "${SLICE_DIR}0" || {
echo "${COLRED}Error${COLRESET} extracting ${NETWORK} chainstate archive"
exit
}
echo "Moving marf database: ${SLICE_DIR}0/chainstate/vm/clarity/marf.sqlite.blobs -> ${COLYELLOW}${SCRATCH_DIR}/marf.sqlite.blobs${COLRESET}"
mv "${SLICE_DIR}"0/chainstate/vm/clarity/marf.sqlite.blobs "${SCRATCH_DIR}"/
echo "Symlinking marf database: ${SCRATCH_DIR}/marf.sqlite.blobs -> ${COLYELLOW}${SLICE_DIR}0/chainstate/vm/clarity/marf.sqlite.blobs${COLRESET}"
ln -s "${SCRATCH_DIR}"/marf.sqlite.blobs "${SLICE_DIR}"0/chainstate/vm/clarity/marf.sqlite.blobs || {
echo "${COLRED}Error${COLRESET} creating symlink: ${SCRATCH_DIR}/marf.sqlite.blobs -> ${SLICE_DIR}0/chainstate/vm/clarity/marf.sqlite.blobs"
exit 1
}
## create a copy of the linked db with <number of CORES><number of RESERVED CORES>
## decrement by 1 since we already have ${SLICE_DIR}0
for ((i=1;i<=$(( CORES - RESERVED - 1 ));i++)); do
echo "Copying ${SLICE_DIR}0 -> ${COLYELLOW}${SLICE_DIR}${i}${COLRESET}"
cp -R "${SLICE_DIR}0" "${SLICE_DIR}${i}" || {
echo "${COLRED}Error${COLRESET} copying ${SLICE_DIR}0 -> ${SLICE_DIR}${i}"
exit 1
}
done
}
## setup the tmux sessions and create the logdir for storing output
setup_replay() {
## if there is an existing folder, rm it
if [ -d "${LOG_DIR}" ];then
echo "Removing logdir ${LOG_DIR}"
rm -rf "${LOG_DIR}"
fi
## create LOG_DIR to store output files
if [ ! -d "${LOG_DIR}" ]; then
echo "Creating logdir ${LOG_DIR}"
mkdir -p "${LOG_DIR}"
fi
## if tmux session "replay" exists, kill it and start anew
if eval "tmux list-windows -t ${TMUX_SESSION} &> /dev/null"; then
echo "Killing existing tmux session: ${TMUX_SESSION}"
eval "tmux kill-session -t ${TMUX_SESSION} &> /dev/null"
fi
local slice_counter=0
## create tmux session named ${TMUX_SESSION} with a window named slice0
tmux new-session -d -s ${TMUX_SESSION} -n slice${slice_counter} || {
echo "${COLRED}Error${COLRESET} creating tmux session ${COLYELLOW}${TMUX_SESSION}${COLRESET}"
exit 1
}
if [ ! -f "${SLICE_DIR}0/chainstate/vm/index.sqlite" ]; then
echo "${COLRED}Error${COLRESET}: chainstate db not found (${SLICE_DIR}0/chainstate/vm/index.sqlite)"
exit 1
fi
return 0
}
## run the block replay
start_replay() {
local mode=$1
local total_blocks=0
local starting_block=0
local inspect_command
local slice_counter=0
case "$mode" in
nakamoto)
## nakamoto blocks
echo "Mode: ${COLYELLOW}${mode}${COLRESET}"
local log_append="_${mode}"
inspect_command="replay-naka-block"
## get the total number of nakamoto blocks in db
total_blocks=$(echo "select count(*) from nakamoto_block_headers" | sqlite3 "${SLICE_DIR}"0/chainstate/vm/index.sqlite)
starting_block=0 # for the block counter, start at this block
## use these values if `--testing` arg is provided (only replay 100 blocks)
${TESTING} && total_blocks=301883
${TESTING} && starting_block=301783
;;
*)
## pre-nakamoto blocks
echo "Mode: ${COLYELLOW}pre-nakamoto${COLRESET}"
local log_append=""
inspect_command="replay-block"
## get the total number of blocks (with orphans) in db
total_blocks=$(echo "select count(*) from staging_blocks where orphaned = 0" | sqlite3 "${SLICE_DIR}"0/chainstate/vm/index.sqlite)
starting_block=0 # for the block counter, start at this block
## use these values if `--testing` arg is provided (only replay 100 blocks) Note: 2.5 epoch is at 153106
${TESTING} && total_blocks=153000
${TESTING} && starting_block=152900
;;
esac
local block_diff=$((total_blocks - starting_block)) ## how many blocks are being replayed
local slices=$((CORES - RESERVED)) ## how many replay slices to use
local slice_blocks=$((block_diff / slices)) ## how many blocks to replay per slice
${TESTING} && echo "${COLRED}Testing: ${TESTING}${COLRESET}"
echo "Total blocks: ${COLYELLOW}${total_blocks}${COLRESET}"
echo "Staring Block: ${COLYELLOW}$starting_block${COLRESET}"
echo "Block diff: ${COLYELLOW}$block_diff${COLRESET}"
echo "******************************************************"
echo "Total slices: ${COLYELLOW}${slices}${COLRESET}"
echo "Blocks per slice: ${COLYELLOW}${slice_blocks}${COLRESET}"
local end_block_count=$starting_block
while [[ ${end_block_count} -lt ${total_blocks} ]]; do
local start_block_count=$end_block_count
end_block_count=$((end_block_count + slice_blocks))
if [[ "${end_block_count}" -gt "${total_blocks}" ]] || [[ "${slice_counter}" -eq $((slices - 1)) ]]; then
end_block_count="${total_blocks}"
fi
if [ "${mode}" != "nakamoto" ]; then ## don't create the tmux windows if we're replaying nakamoto blocks (they should already exist). TODO: check if it does exist in case the function call order changes
if [ "${slice_counter}" -gt 0 ];then
tmux new-window -t replay -d -n "slice${slice_counter}" || {
echo "${COLRED}Error${COLRESET} creating tmux window ${COLYELLOW}slice${slice_counter}${COLRESET}"
exit 1
}
fi
fi
local log_file="${LOG_DIR}/slice${slice_counter}${log_append}.log"
local log=" | tee -a ${log_file}"
local cmd="${REPO_DIR}/target/release/stacks-inspect --config ${REPO_DIR}/stackslib/conf/${NETWORK}-follower-conf.toml ${inspect_command} ${SLICE_DIR}${slice_counter} index-range $start_block_count $end_block_count 2>/dev/null"
echo " Creating tmux window: ${COLGREEN}replay:slice${slice_counter}${COLRESET} :: Blocks: ${COLYELLOW}${start_block_count}-${end_block_count}${COLRESET} || Logging to: ${log_file}"
echo "Command: ${cmd}" > "${log_file}" ## log the command being run for the slice
echo "Replaying indexed blocks: ${start_block_count}-${end_block_count} (out of ${total_blocks})" >> "${log_file}"
## send `cmd` to the tmux window where the replay will run
tmux send-keys -t "${TMUX_SESSION}:slice${slice_counter}" "${cmd}${log}" Enter || {
echo "${COLRED}Error${COLRESET} sending replay command to tmux window ${COLYELLOW}slice${slice_counter}${COLRESET}"
exit 1
}
## log the return code as the last line
tmux send-keys -t "${TMUX_SESSION}:slice${slice_counter}" "echo \${PIPESTATUS[0]} >> ${log_file}" Enter || {
echo "${COLRED}Error${COLRESET} sending return status command to tmux window ${COLYELLOW}slice${slice_counter}${COLRESET}"
exit 1
}
slice_counter=$((slice_counter + 1))
done
check_progress
}
## pretty print the status output (simple spinner while pids are active)
check_progress() {
# give the pids a few seconds to show up in process table before checking if they're running
local sleep_duration=5
local progress=1
local sp="/-\|"
local count
while [ $sleep_duration -gt 0 ]; do
${TERM_OUT} && printf "Sleeping ... \b [ %s%s%s ] \033[0K\r" "${COLYELLOW}" "${sleep_duration}" "${COLRESET}"
sleep_duration=$((sleep_duration-1))
sleep 1
done
echo "************************************************************************"
echo "Checking Block Replay status"
echo -e ' '
while true; do
count=$(pgrep -c "stacks-inspect")
if [ "${count}" -gt 0 ]; then
${TERM_OUT} && printf "Block replay processes are currently active [ %s%s%s%s ] ... \b${sp:progress++%${#sp}:1} \033[0K\r" "${COLYELLOW}" "${COLBOLD}" "${count}" "${COLRESET}"
else
${TERM_OUT} && printf "\r\n"
break
fi
done
echo "************************************************************************"
}
## store the results in an aggregated logfile and an html file
store_results() {
## text file to store results
local results="${LOG_DIR}/results.log"
## html file to store results
local results_html="${LOG_DIR}/results.html"
local failed=0;
local return_code=0;
local failure_count
echo "Results: ${COLYELLOW}${results}${COLRESET}"
cd "${LOG_DIR}" || {
echo "${COLRED}Error${COLRESET} Logdir ${COLYELLOW}${LOG_DIR}${COLRESET} doesn't exist"
exit 1
}
## retrieve the count of all lines with `Failed processing block`
failure_count=$(grep -rc "Failed processing block" slice*.log | awk -F: '$NF >= 0 {x+=$NF; $NF=""} END{print x}')
if [ "${failure_count}" -gt 0 ]; then
echo "Failures: ${COLRED}${failure_count}${COLRESET}"
else
echo "Failures: ${COLGREEN}${failure_count}${COLRESET}"
fi
echo "Failures: ${failure_count}" > "${results}"
## check the return codes to see if we had a panic
for file in $(find . -name "slice*.log" -printf '%P\n' | sort); do
# for file in $(ls slice*.log | sort); do
echo "Checking file: ${COLYELLOW}$file${COLRESET}"
return_code=$(tail -1 "${file}")
case ${return_code} in
0)
# block replay ran successfully
;;
1)
# block replay had some block failures
failed=1
;;
*)
# return code likely indicates a panic
failed=1
echo "$file return code: $return_code" >> "${results}" # ok to continue if this write fails
;;
esac
done
## Store the results as HTML:
cat <<- _EOF_ > "${results_html}"
<body>
<style>
@import url('https://fonts.googleapis.com/css2?family=Source+Code+Pro:ital,wght@0,200..900;1,200..900&display=swap');
.container {
border: 1px outset black;
padding: 5px;
border-radius: 5px;
background-color: #eae9e8;
}
.fail {
background-color: #ffffff;
border: 1px outset black;
border-radius: 5px;
font-weight: 350;
}
.pass {
background-color: #eae9e8;
}
.result {
text-align: left;
padding-left: 10px;
padding-top: 10px;
padding-bottom: 10px;
margin: 5px;
}
body {
font-family: "Source Code Pro", monospace;
font-optical-sizing: auto;
font-style: normal;
}
</style>
<h2>$(date -u)</h2>
<hr/>
<h2>Failures: ${failure_count}</h2>
<div class="container">
_EOF_
## use the $failed var here in case there is a panic, then $failure_count may show zero, but the replay was not successful
if [ ${failed} == "1" ];then
output=$(grep -r -h "Failed processing block" slice*.log)
IFS=$'\n'
for line in ${output}; do
echo " <div class=\"result fail\">${line}</div>" >> "${results_html}" || {
echo "${COLRED}Error${COLRESET} writing failure to: ${results_html}"
}
echo "${line}" >> "${results}" || {
echo "${COLRED}Error${COLRESET} writing failure to: ${results}"
}
done
else
echo " <div class=\"result\">Test Passed</div>" >> "${results_html}"
fi
echo " </div>" >> "${results_html}"
echo "</body>" >> "${results_html}"
}
## upload results to s3 for webpage display
upload_results() {
## upload the results folder to s3 in the format of year-month-day-epoch
local s3_folder_name
s3_folder_name=$(basename "${LOG_DIR}" | cut -f2 -d "_")
if [ ! -d "${LOG_DIR}" ]; then
echo "${COLRED}Error${COLRESET} - logdir (${COLYELLOW}${LOG_DIR}${COLRESET}) is missing."
echo "${COLYELLOW}Skipping logs upload${COLRESET}"
else
if eval "aws s3 ls s3://${S3_BUCKET}/${S3_ROOT_FOLDER} > /dev/null 2>&1"; then
## upload LOG_DIR results folder
aws s3 cp --recursive --cache-control 'no-cache' --content-type text/plain --metadata-directive REPLACE "${LOG_DIR}" "s3://${S3_BUCKET}/${S3_ROOT_FOLDER}/${s3_folder_name}" || {
echo "${COLRED}Error${COLRESET} Uploading ${LOG_DIR} folder to ${COLYELLOW}s3://${S3_BUCKET}/${S3_ROOT_FOLDER}/${s3_folder_name}${COLRESET}"
}
## re-upload LOG_DIR/results.html with metadata set to text/html
aws s3 cp --cache-control 'no-cache' --content-type text/html --metadata-directive REPLACE "${LOG_DIR}/results.html" "s3://${S3_BUCKET}/${S3_ROOT_FOLDER}/${s3_folder_name}/results.html" || {
echo "${COLRED}Error${COLRESET} re-uploading ${LOG_DIR}/results.html file to ${COLYELLOW}s3://${S3_BUCKET}/${S3_ROOT_FOLDER}/${s3_folder_name}/results.html${COLRESET}"
}
## re-upload LOG_DIR/results.html in bucket root as latest.html
aws s3 cp --cache-control 'no-cache' --content-type text/html --metadata-directive REPLACE "${LOG_DIR}/results.html" "s3://${S3_BUCKET}/latest.html" || {
echo "${COLRED}Error${COLRESET} re-uploading ${LOG_DIR}/results.html file to ${COLYELLOW}s3://${S3_BUCKET}/results/${s3_folder_name}/latest.html${COLRESET}"
}
fi
fi
}
## show usage and exit
usage() {
echo
echo "Usage:"
echo " ${COLBOLD}${0}${COLRESET}"
echo " ${COLYELLOW}--testing${COLRESET}: only check a small number of blocks"
echo " ${COLYELLOW}-t|--terminal${COLRESET}: more terminal friendly output"
echo " ${COLYELLOW}-u|--upload${COLRESET}: upload results to s3"
echo " ${COLYELLOW}-n|--network${COLRESET}: run block replay against specific network (default: mainnet)"
echo " ${COLYELLOW}-b|--branch${COLRESET}: branch of stacks-core to build stacks-inspect from (default: develop)"
echo " ${COLYELLOW}-r|--reserved${COLRESET}: how many cpu cores to reserve for system tasks"
echo
echo " ex: ${COLCYAN}${0} -t -u ${COLRESET}"
echo
exit 0
}
## install missing dependencies
for cmd in curl tmux git wget tar gzip grep cargo pgrep aws; do
command -v "${cmd}" >/dev/null 2>&1 || {
case "${cmd}" in
"cargo")
install_cargo
;;
"pgrep")
package="procps"
;;
"aws")
package="awscli"
;;
*)
package="${cmd}"
;;
esac
(sudo apt-get update && sudo apt-get install "${package}") || {
echo "${COLRED}Error${COLRESET} installing $package"
exit 1
}
}
done
## parse cmd-line args
while [ ${#} -gt 0 ]; do
case ${1} in
--testing)
# only replay 100 blocks
TESTING=true
;;
-t|--terminal)
# update terminal with progress (it's just printf to show in real-time that the replays are running)
TERM_OUT=true
;;
-u|--upload)
# required if uploading results to s3
UPLOAD=true
;;
-n|--network)
# required if not mainnet
if [ "${2}" == "" ]; then
echo "Missing required value for ${1}"
fi
NETWORK=${2}
shift
;;
-b|--branch)
# build from specific branch
if [ "${2}" == "" ]; then
echo "Missing required value for ${1}"
fi
BRANCH=${2}
shift
;;
-r|--RESERVED)
# reserve this many cpus for the system (default is 10)
if [ "${2}" == "" ]; then
echo "Missing required value for ${1}"
fi
if ! [[ "$2" =~ ^[0-9]+$ ]]; then
echo "ERROR: arg ($2) is not a number." >&2
exit 1
fi
RESERVED=${2}
shift
;;
-h|--help|--usage)
# show usage/options and exit
usage
;;
esac
shift
done
## clear display before starting
tput reset
echo "Replay Started: ${COLYELLOW}$(date)${COLRESET}"
build_stacks_inspect ## comment if using an existing chainstate/slice dir (ex: replay was performed already, and a second run is desired)
configure_replay_slices ## comment if using an existing chainstate/slice dir (ex: replay was performed already, and a second run is desired)
setup_replay ## configure logdir and tmux sessions
start_replay ## replay pre-nakamoto blocks (2.x)
start_replay nakamoto ## replay nakamoto blocks
store_results ## store aggregated results of replay
${UPLOAD} && upload_results ## only upload results if -u arg is supplied
echo "Replay finished: $(date)"
@wileyj
Copy link
Author

wileyj commented Aug 22, 2024

Requires ~73GB per slice, plus an extra ~400GB for the chainstate archive and marf DB
as of 02/2025:

  • for 10 slices, total disk used is ~1.1TB
    • 149GB for compressed chainstate
    • 232GB decompressed marf db
    • 73GB per slice dir (1 dir per cpu)
  • for 15 slices, this is about 1.46TB
  • for 20 slices, this is about 1.8TB

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment