Skip to content

Instantly share code, notes, and snippets.

@P3GLEG
Created February 10, 2025 09:45
Show Gist options
  • Save P3GLEG/09c7732cfff06cf2ec36c18f5e563ae9 to your computer and use it in GitHub Desktop.
Save P3GLEG/09c7732cfff06cf2ec36c18f5e563ae9 to your computer and use it in GitHub Desktop.
Hacky script to find a model's optimal settings on a local device
#!/bin/bash
#
# Benchmark llama-cli across batch-size / context-size / RoPE combinations
# and record throughput plus peak resource usage in a CSV summary file.
#
# Requires: ./llama-cli in the current directory, nvidia-smi, bc, GNU grep.

# Candidate values to iterate over.
BATCH_SIZES=(512 1024 2048 4096)
CTX_SIZES=(4096 8192 16384 32768 65536 131072)
ROPE_SCALINGS=("linear" "yarn")
ROPE_SCALES=(1 1.5 2)
ROPE_FREQ_BASE=1000000
ROPE_FREQ_SCALE=1

# Defaults so --rope-scaling / --rope-scale are never passed empty strings
# (previously these scalars were referenced but never assigned). The
# benchmark loop may override them per run.
ROPE_SCALING="${ROPE_SCALINGS[0]}"
ROPE_SCALE="${ROPE_SCALES[0]}"

# CSV summary file: one row per tested combination, including RoPE settings.
SUMMARY_LOG="summary_results.txt"
echo "Batch Size, Context Size, Sampling TPS, Prompt Eval TPS, Token Eval TPS, Peak CPU (%), Peak RAM, Peak GPU (%), Peak VRAM, RoPE Scaling, RoPE Scale, RoPE Freq Base, RoPE Freq Scale" > "$SUMMARY_LOG"

# Model to benchmark (fill in) and the fixed prompt used for every run.
MODEL_PATH=""
PROMPT="I believe the meaning of life is"
# Benchmark every combination of batch size, context size, and RoPE
# configuration. (Previously the RoPE arrays were defined but never
# iterated, so the RoPE flags were passed empty and the CSV's RoPE
# columns were meaningless.)
for BATCH_SIZE in "${BATCH_SIZES[@]}"; do
  for CTX in "${CTX_SIZES[@]}"; do
    for ROPE_SCALING in "${ROPE_SCALINGS[@]}"; do
      for ROPE_SCALE in "${ROPE_SCALES[@]}"; do
        echo "Running with Batch Size: $BATCH_SIZE, Context Size: $CTX, RoPE: $ROPE_SCALING scale=$ROPE_SCALE"

        # (Optional) Skip overly high context lengths that produce N/A results:
        # if [[ "$CTX" -eq 131072 ]]; then
        #   echo "Skipping context size $CTX (results tend to be N/A)"
        #   continue
        # fi

        # Remove any existing log from a previous run.
        rm -f testing.txt

        # Run llama-cli in the background with explicit RoPE settings.
        # --gpu-layers is just an arbitrarily high number; it only matters
        # when layers would otherwise be offloaded to the CPU. (The original
        # put this comment *after* the line-continuation backslash, which
        # silently truncated the command and broke all following flags.)
        ./llama-cli \
          -m "$MODEL_PATH" \
          --ctx-size "$CTX" \
          --batch-size "$BATCH_SIZE" \
          -p "$PROMPT" \
          --threads 24 \
          -fa \
          --seed 42 \
          --gpu-layers 10000 \
          --cache-type-k q4_0 \
          --cache-type-v q4_0 \
          --rope-scaling "$ROPE_SCALING" \
          --rope-scale "$ROPE_SCALE" \
          --rope-freq-base "$ROPE_FREQ_BASE" \
          --rope-freq-scale "$ROPE_FREQ_SCALE" \
          -no-cnv \
          --log-file testing.txt &
        LLAMA_PID=$!
        echo "Waiting for llama-cli (PID: $LLAMA_PID) to finish..."

        # Poll once per second while the process runs, tracking peak CPU,
        # RAM, GPU utilization, and VRAM. (GPU stats were previously read
        # once *after* the run finished, which samples idle usage rather
        # than the peak.)
        PEAK_CPU=0
        PEAK_RAM=0
        PEAK_GPU=0
        PEAK_VRAM=0
        while ps -p "$LLAMA_PID" > /dev/null 2>&1; do
          CURR_CPU=$(ps -p "$LLAMA_PID" -o %cpu --no-headers | awk '{print $1}')
          CURR_RAM=$(ps -p "$LLAMA_PID" -o rss --no-headers | awk '{print $1}')
          # Highest value across GPUs; empty if nvidia-smi is unavailable.
          CURR_GPU=$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits 2>/dev/null | sort -nr | head -n 1)
          CURR_VRAM=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null | sort -nr | head -n 1)
          # %cpu is fractional, so compare via bc; the rest are integers.
          if [[ -n "$CURR_CPU" && $(echo "$CURR_CPU > $PEAK_CPU" | bc -l) -eq 1 ]]; then
            PEAK_CPU=$CURR_CPU
          fi
          if [[ -n "$CURR_RAM" && "$CURR_RAM" -gt "$PEAK_RAM" ]]; then
            PEAK_RAM=$CURR_RAM
          fi
          if [[ -n "$CURR_GPU" && "$CURR_GPU" -gt "$PEAK_GPU" ]]; then
            PEAK_GPU=$CURR_GPU
          fi
          if [[ -n "$CURR_VRAM" && "$CURR_VRAM" -gt "$PEAK_VRAM" ]]; then
            PEAK_VRAM=$CURR_VRAM
          fi
          sleep 1
        done
        # Reap the finished background job (collects its exit status).
        wait "$LLAMA_PID" 2>/dev/null
        echo "llama-cli completed."

        # Wait for the log file to contain data, but give up after 30s so a
        # run that crashed before writing anything can't hang the benchmark
        # forever (the original loop spun indefinitely in that case).
        WAITED=0
        while [[ ! -s testing.txt && "$WAITED" -lt 30 ]]; do
          sleep 1
          WAITED=$((WAITED + 1))
        done

        echo "Extracting performance metrics..."
        # Pull the tokens-per-second figures out of llama-cli's timing lines.
        SAMPLING_TPS=$(grep -oP 'sampling time.*?,\s*\K[\d.]+(?= tokens per second)' testing.txt | tail -n 1)
        PROMPT_TPS=$(grep -oP 'prompt eval time.*?,\s*\K[\d.]+(?= tokens per second)' testing.txt | tail -n 1)
        EVAL_TPS=$(grep -oP 'eval time.*?,\s*\K[\d.]+(?= tokens per second)' testing.txt | tail -n 1)
        # Record "N/A" for anything the run did not report.
        [[ -z "$SAMPLING_TPS" ]] && SAMPLING_TPS="N/A"
        [[ -z "$PROMPT_TPS" ]] && PROMPT_TPS="N/A"
        [[ -z "$EVAL_TPS" ]] && EVAL_TPS="N/A"

        # ps reports RSS in KB; convert to a human-readable MB/GB figure.
        if (( PEAK_RAM > 1048576 )); then
          PEAK_RAM_HR="$(echo "scale=2; $PEAK_RAM/1048576" | bc) GB"
        else
          PEAK_RAM_HR="$(echo "scale=2; $PEAK_RAM/1024" | bc) MB"
        fi

        # nvidia-smi reports VRAM in MB; switch to GB above 10 GB.
        if (( PEAK_VRAM > 10240 )); then
          PEAK_VRAM_HR="$(echo "scale=2; $PEAK_VRAM/1024" | bc) GB"
        else
          PEAK_VRAM_HR="${PEAK_VRAM} MB"
        fi

        # Append one CSV row, including the RoPE settings for this run.
        printf "%s, %s, %s, %s, %s, %s%%, %s, %s%%, %s, %s, %s, %s, %s\n" \
          "$BATCH_SIZE" "$CTX" "$SAMPLING_TPS" "$PROMPT_TPS" "$EVAL_TPS" "$PEAK_CPU" "$PEAK_RAM_HR" "$PEAK_GPU" "$PEAK_VRAM_HR" \
          "$ROPE_SCALING" "$ROPE_SCALE" "$ROPE_FREQ_BASE" "$ROPE_FREQ_SCALE" >> "$SUMMARY_LOG"
        echo "Finished test for Batch Size: $BATCH_SIZE, Context Size: $CTX"
        echo "-----------------------------------------"
      done
    done
  done
done
echo "All tests completed. Summary stored in $SUMMARY_LOG."
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment