Created
February 10, 2025 09:45
-
-
Save P3GLEG/09c7732cfff06cf2ec36c18f5e563ae9 to your computer and use it in GitHub Desktop.
Hacky script to find the optimal model settings on a local device by sweeping benchmark parameters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Sweep llama-cli over batch sizes, context sizes, and RoPE settings,
# recording tokens/sec plus peak CPU, RAM, GPU and VRAM for each run.
#
# Requirements: ./llama-cli in the working directory, nvidia-smi, bc, GNU grep.
# Set MODEL_PATH below before running.

set -u

# Parameter grid to sweep.
BATCH_SIZES=(512 1024 2048 4096)
CTX_SIZES=(4096 8192 16384 32768 65536 131072)
ROPE_SCALINGS=("linear" "yarn")
ROPE_SCALES=(1 1.5 2)
ROPE_FREQ_BASE=1000000
ROPE_FREQ_SCALE=1

# CSV summary of every run, including the RoPE settings.
SUMMARY_LOG="summary_results.txt"
echo "Batch Size, Context Size, Sampling TPS, Prompt Eval TPS, Token Eval TPS, Peak CPU (%), Peak RAM, Peak GPU (%), Peak VRAM, RoPE Scaling, RoPE Scale, RoPE Freq Base, RoPE Freq Scale" > "$SUMMARY_LOG"

# Model path and prompt (fill in MODEL_PATH before running).
MODEL_PATH=""
PROMPT="I believe the meaning of life is"

for BATCH_SIZE in "${BATCH_SIZES[@]}"; do
  for CTX in "${CTX_SIZES[@]}"; do
    # The RoPE arrays must actually be iterated, otherwise $ROPE_SCALING /
    # $ROPE_SCALE are empty when handed to llama-cli.
    for ROPE_SCALING in "${ROPE_SCALINGS[@]}"; do
      for ROPE_SCALE in "${ROPE_SCALES[@]}"; do
        echo "Running with Batch Size: $BATCH_SIZE, Context Size: $CTX, RoPE: $ROPE_SCALING scale=$ROPE_SCALE"

        # (Optional) Skip overly high context lengths that tend to produce N/A:
        # if [[ "$CTX" -eq 131072 ]]; then
        #   echo "Skipping context size $CTX (results tend to be N/A)"
        #   continue
        # fi

        # Remove any existing log from a previous run.
        rm -f testing.txt

        # Run llama-cli in the background with explicit RoPE settings.
        # NOTE: --gpu-layers is deliberately a huge number so every layer is
        # offloaded; the exact value only matters if you offload to CPU.
        # (An inline comment after a continuation backslash would break the
        # command, so that note lives up here instead.)
        ./llama-cli \
          -m "$MODEL_PATH" \
          --ctx-size "$CTX" \
          --batch-size "$BATCH_SIZE" \
          -p "$PROMPT" \
          --threads 24 \
          -fa \
          --seed 42 \
          --gpu-layers 10000 \
          --cache-type-k q4_0 \
          --cache-type-v q4_0 \
          --rope-scaling "$ROPE_SCALING" \
          --rope-scale "$ROPE_SCALE" \
          --rope-freq-base "$ROPE_FREQ_BASE" \
          --rope-freq-scale "$ROPE_FREQ_SCALE" \
          -no-cnv \
          --log-file testing.txt &
        LLAMA_PID=$!
        echo "Waiting for llama-cli (PID: $LLAMA_PID) to finish..."

        # Track peak CPU/RAM — and GPU/VRAM — WHILE the process runs.
        # (Sampling nvidia-smi after exit would read ~0% and freed VRAM.)
        PEAK_CPU=0
        PEAK_RAM=0
        PEAK_GPU=0
        PEAK_VRAM=0
        while ps -p "$LLAMA_PID" > /dev/null 2>&1; do
          CURR_CPU=$(ps -p "$LLAMA_PID" -o %cpu --no-headers | awk '{print $1}')
          CURR_RAM=$(ps -p "$LLAMA_PID" -o rss --no-headers | awk '{print $1}')
          CURR_GPU=$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits 2>/dev/null | sort -nr | head -n 1)
          CURR_VRAM=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits 2>/dev/null | sort -nr | head -n 1)
          if [[ -n "$CURR_CPU" && $(echo "$CURR_CPU > $PEAK_CPU" | bc -l) -eq 1 ]]; then
            PEAK_CPU=$CURR_CPU
          fi
          if [[ -n "$CURR_RAM" && "$CURR_RAM" -gt "$PEAK_RAM" ]]; then
            PEAK_RAM=$CURR_RAM
          fi
          if [[ "$CURR_GPU" =~ ^[0-9]+$ ]] && (( CURR_GPU > PEAK_GPU )); then
            PEAK_GPU=$CURR_GPU
          fi
          if [[ "$CURR_VRAM" =~ ^[0-9]+$ ]] && (( CURR_VRAM > PEAK_VRAM )); then
            PEAK_VRAM=$CURR_VRAM
          fi
          sleep 1
        done
        # Reap the background job and surface its exit status.
        wait "$LLAMA_PID" 2>/dev/null
        LLAMA_STATUS=$?
        echo "llama-cli completed (exit status $LLAMA_STATUS)."

        # Wait (bounded) for testing.txt to contain data; an unbounded loop
        # would hang forever if llama-cli died before writing anything.
        for _ in $(seq 1 30); do
          [[ -s testing.txt ]] && break
          sleep 1
        done

        echo "Extracting performance metrics..."
        # Pull the tokens-per-second figures out of the llama-cli timing lines.
        SAMPLING_TPS=$(grep -oP 'sampling time.*?,\s*\K[\d.]+(?= tokens per second)' testing.txt 2>/dev/null | tail -n 1)
        PROMPT_TPS=$(grep -oP 'prompt eval time.*?,\s*\K[\d.]+(?= tokens per second)' testing.txt 2>/dev/null | tail -n 1)
        EVAL_TPS=$(grep -oP 'eval time.*?,\s*\K[\d.]+(?= tokens per second)' testing.txt 2>/dev/null | tail -n 1)

        # If extraction failed, record "N/A".
        [[ -z "$SAMPLING_TPS" ]] && SAMPLING_TPS="N/A"
        [[ -z "$PROMPT_TPS" ]] && PROMPT_TPS="N/A"
        [[ -z "$EVAL_TPS" ]] && EVAL_TPS="N/A"

        # Convert PEAK_RAM (KB, from ps rss) to human-readable MB/GB.
        if (( PEAK_RAM > 1048576 )); then
          PEAK_RAM_HR="$(echo "scale=2; $PEAK_RAM/1048576" | bc) GB"
        else
          PEAK_RAM_HR="$(echo "scale=2; $PEAK_RAM/1024" | bc) MB"
        fi

        # Convert PEAK_VRAM (MiB, from nvidia-smi) to human-readable form.
        if (( PEAK_VRAM > 10240 )); then
          PEAK_VRAM_HR="$(echo "scale=2; $PEAK_VRAM/1024" | bc) GB"
        else
          PEAK_VRAM_HR="${PEAK_VRAM} MB"
        fi

        # Append one CSV row, including the RoPE settings for this run.
        printf "%s, %s, %s, %s, %s, %s%%, %s, %s%%, %s, %s, %s, %s, %s\n" \
          "$BATCH_SIZE" "$CTX" "$SAMPLING_TPS" "$PROMPT_TPS" "$EVAL_TPS" "$PEAK_CPU" "$PEAK_RAM_HR" "$PEAK_GPU" "$PEAK_VRAM_HR" \
          "$ROPE_SCALING" "$ROPE_SCALE" "$ROPE_FREQ_BASE" "$ROPE_FREQ_SCALE" >> "$SUMMARY_LOG"

        echo "Finished test for Batch Size: $BATCH_SIZE, Context Size: $CTX"
        echo "-----------------------------------------"
      done
    done
  done
done
echo "All tests completed. Summary stored in $SUMMARY_LOG."
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment