Created
February 10, 2025 09:45
-
-
Save P3GLEG/09c7732cfff06cf2ec36c18f5e563ae9 to your computer and use it in GitHub Desktop.
Hacky script to find the optimal model settings on a local device by sweeping benchmark parameters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Sweep llama-cli over batch sizes, context sizes, and RoPE settings,
# recording tokens/sec plus peak CPU, RAM, GPU and VRAM for each run.
#
# Requirements: ./llama-cli in the working directory, nvidia-smi, bc, GNU grep.
# Set MODEL_PATH below before running.

set -u

# Parameter grid to sweep.
BATCH_SIZES=(512 1024 2048 4096)
CTX_SIZES=(4096 8192 16384 32768 65536 131072)
ROPE_SCALINGS=("linear" "yarn")
ROPE_SCALES=(1 1.5 2)
ROPE_FREQ_BASE=1000000
ROPE_FREQ_SCALE=1

# CSV summary of every run, including the RoPE settings.
SUMMARY_LOG="summary_results.txt"
echo "Batch Size, Context Size, Sampling TPS, Prompt Eval TPS, Token Eval TPS, Peak CPU (%), Peak RAM, Peak GPU (%), Peak VRAM, RoPE Scaling, RoPE Scale, RoPE Freq Base, RoPE Freq Scale" > "$SUMMARY_LOG"

# Model path and prompt (fill in MODEL_PATH before running).
MODEL_PATH=""
PROMPT="I believe the meaning of life is"

for BATCH_SIZE in "${BATCH_SIZES[@]}"; do
  for CTX in "${CTX_SIZES[@]}"; do
    # The RoPE arrays must actually be iterated, otherwise $ROPE_SCALING /
    # $ROPE_SCALE are empty when handed to llama-cli.
    for ROPE_SCALING in "${ROPE_SCALINGS[@]}"; do
      for ROPE_SCALE in "${ROPE_SCALES[@]}"; do
        echo "Running with Batch Size: $BATCH_SIZE, Context Size: $CTX, RoPE: $ROPE_SCALING scale=$ROPE_SCALE"

        # (Optional) Skip overly high context lengths that tend to produce N/A:
        # if [[ "$CTX" -eq 131072 ]]; then
        #   echo "Skipping context size $CTX (results tend to be N/A)"
        #   continue
        # fi

        # Remove any existing log from a previous run.
        rm -f testing.txt

        # Run llama-cli in the background with explicit RoPE settings.
        # NOTE: --gpu-layers is deliberately a huge number so every layer is
        # offloaded; the exact value only matters if you offload to CPU.
        # (An inline comment after a continuation backslash would break the
        # command, so that note lives up here instead.)
        ./llama-cli \
          -m "$MODEL_PATH" \
          --ctx-size "$CTX" \
          --batch-size "$BATCH_SIZE" \
          -p "$PROMPT" \
          --threads 24 \
          -fa \
          --seed 42 \
          --gpu-layers 10000 \
          --cache-type-k q4_0 \
          --cache-type-v q4_0 \
          --rope-scaling "$ROPE_SCALING" \
          --rope-scale "$ROPE_SCALE" \
          --rope-freq-base "$ROPE_FREQ_BASE" \
          --rope-freq-scale "$ROPE_FREQ_SCALE" \
          -no-cnv \
          --log-file testing.txt &
        LLAMA_PID=$!
        echo "Waiting for llama-cli (PID: $LLAMA_PID) to finish..."

        # Track peak CPU/RAM — and GPU/VRAM — WHILE the process runs.
        # (Sampling nvidia-smi after exit would read ~0% and freed VRAM.)
        PEAK_CPU=0
        PEAK_RAM=0
        PEAK_GPU=0
        PEAK_VRAM=0
        while ps -p "$LLAMA_PID" > /dev/null 2>&1; do
          CURR_CPU=$(ps -p "$LLAMA_PID" -o %cpu --no-headers | awk '{print $1}')
          CURR_RAM=$(ps -p "$LLAMA_PID" -o rss --no-headers | awk '{print $1}')
          CURR_GPU=$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits 2>/dev/null | sort -nr | head -n 1)
          CURR_VRAM=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null | sort -nr | head -n 1)
          if [[ -n "$CURR_CPU" && $(echo "$CURR_CPU > $PEAK_CPU" | bc -l) -eq 1 ]]; then
            PEAK_CPU=$CURR_CPU
          fi
          if [[ -n "$CURR_RAM" && "$CURR_RAM" -gt "$PEAK_RAM" ]]; then
            PEAK_RAM=$CURR_RAM
          fi
          if [[ "$CURR_GPU" =~ ^[0-9]+$ ]] && (( CURR_GPU > PEAK_GPU )); then
            PEAK_GPU=$CURR_GPU
          fi
          if [[ "$CURR_VRAM" =~ ^[0-9]+$ ]] && (( CURR_VRAM > PEAK_VRAM )); then
            PEAK_VRAM=$CURR_VRAM
          fi
          sleep 1
        done
        # Reap the background job and surface its exit status.
        wait "$LLAMA_PID" 2>/dev/null
        LLAMA_STATUS=$?
        echo "llama-cli completed (exit status $LLAMA_STATUS)."

        # Wait (bounded) for testing.txt to contain data; an unbounded loop
        # would hang forever if llama-cli died before writing anything.
        for _ in $(seq 1 30); do
          [[ -s testing.txt ]] && break
          sleep 1
        done

        echo "Extracting performance metrics..."
        # Pull the tokens-per-second figures out of the llama-cli timing lines.
        SAMPLING_TPS=$(grep -oP 'sampling time.*?,\s*\K[\d.]+(?= tokens per second)' testing.txt 2>/dev/null | tail -n 1)
        PROMPT_TPS=$(grep -oP 'prompt eval time.*?,\s*\K[\d.]+(?= tokens per second)' testing.txt 2>/dev/null | tail -n 1)
        EVAL_TPS=$(grep -oP 'eval time.*?,\s*\K[\d.]+(?= tokens per second)' testing.txt 2>/dev/null | tail -n 1)

        # If extraction failed, record "N/A".
        [[ -z "$SAMPLING_TPS" ]] && SAMPLING_TPS="N/A"
        [[ -z "$PROMPT_TPS" ]] && PROMPT_TPS="N/A"
        [[ -z "$EVAL_TPS" ]] && EVAL_TPS="N/A"

        # Convert PEAK_RAM (KB, from ps rss) to human-readable MB/GB.
        if (( PEAK_RAM > 1048576 )); then
          PEAK_RAM_HR="$(echo "scale=2; $PEAK_RAM/1048576" | bc) GB"
        else
          PEAK_RAM_HR="$(echo "scale=2; $PEAK_RAM/1024" | bc) MB"
        fi

        # Convert PEAK_VRAM (MiB, from nvidia-smi) to human-readable form.
        if (( PEAK_VRAM > 10240 )); then
          PEAK_VRAM_HR="$(echo "scale=2; $PEAK_VRAM/1024" | bc) GB"
        else
          PEAK_VRAM_HR="${PEAK_VRAM} MB"
        fi

        # Append one CSV row, including the RoPE settings for this run.
        printf "%s, %s, %s, %s, %s, %s%%, %s, %s%%, %s, %s, %s, %s, %s\n" \
          "$BATCH_SIZE" "$CTX" "$SAMPLING_TPS" "$PROMPT_TPS" "$EVAL_TPS" "$PEAK_CPU" "$PEAK_RAM_HR" "$PEAK_GPU" "$PEAK_VRAM_HR" \
          "$ROPE_SCALING" "$ROPE_SCALE" "$ROPE_FREQ_BASE" "$ROPE_FREQ_SCALE" >> "$SUMMARY_LOG"

        echo "Finished test for Batch Size: $BATCH_SIZE, Context Size: $CTX"
        echo "-----------------------------------------"
      done
    done
  done
done
echo "All tests completed. Summary stored in $SUMMARY_LOG."
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment