Skip to content

Instantly share code, notes, and snippets.

@ochafik

ochafik/b.sh Secret

Last active April 28, 2024 17:28
Show Gist options
  • Select an option

  • Save ochafik/5ddc619e1168ec6afbfafd50e1301d21 to your computer and use it in GitHub Desktop.

Select an option

Save ochafik/5ddc619e1168ec6afbfafd50e1301d21 to your computer and use it in GitHub Desktop.
llama.cpp hyperfine commands
#!/bin/bash
# Build helper: builds llama.cpp's `main` target with either GNU make or
# CMake, deriving extra build flags from substrings of the tool name
# (e.g. "cmake-lto-nonative" -> LLAMA_LTO=1, LLAMA_NATIVE=0).
# Usage: ./b.sh <tool>   where <tool> starts with "make" or "cmake".
set -euo pipefail

make_args=()
cmake_args=()
# Flags used here in past experiments, kept for reference:
# -DCMAKE_OSX_ARCHITECTURES=arm64
# -DCMAKE_BUILD_TYPE=Release
# -DCMAKE_OSX_ARCHITECTURES=x86_64;arm64

# set_flag NAME=VALUE — record a flag for both build systems:
# make gets it verbatim, cmake gets it as a -D define.
function set_flag() {
  make_args+=( "$1" )    # quoted so the flag stays a single word
  cmake_args+=( "-D$1" )
}

set_flag LLAMA_CURL=1

if [[ $# -lt 1 ]]; then
  echo "Usage: $0 <tool>" >&2
  exit 1
fi
tool=$1

# Feature toggles selected by substrings of the tool name.
if [[ $tool == *-openblas* ]]; then
  set_flag LLAMA_OPENBLAS=1
fi
if [[ $tool == *-lto* ]]; then
  set_flag LLAMA_LTO=1
fi
if [[ $tool == *-nonative* ]]; then
  set_flag LLAMA_NATIVE=0
fi

case "$tool" in
  make*)
    make clean
    make -j "${make_args[@]}" main
    ;;
  cmake*)
    rm -fR build
    cmake . -B build "${cmake_args[@]}"
    cmake --build build -j -t main
    ;;
  *)
    echo "Unsupported tool: $tool" >&2
    exit 1
    ;;
esac
# Benchmark branch grammar-speedup3 vs master: each branch is rebuilt in
# --setup, then ./main generates against a JSON-numbers grammar (model is
# fetched/cached by URL via -mu).
hyperfine \
--warmup 1 --runs 5 \
-L branch grammar-speedup3,master \
--setup 'git checkout {branch} && make clean && make -j LLAMA_CURL=1 main' \
'./main \
-mu https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q4_K_M.gguf \
--grammar-file json_numbers.grammar \
-p "List of 20 integers starting from 0" \
--seed 12344'
# Benchmark the agent example (requires the `server` target) on branch
# agent-example vs a pinned commit.
hyperfine \
--warmup 1 --runs 5 \
-L branch agent-example,a474f50ebb3e10be3371562f75f3f573f1a86b5f \
--setup 'git checkout {branch} && make clean && make -j LLAMA_CURL=1 server' \
'python -m examples.agent \
--model ~/AI/Models/Hermes-2-Pro-Mistral-7B.Q8_0.gguf \
--tools examples/agent/tools/example_math_tools.py \
--greedy \
--goal "What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. What is a third of the result?" \
--seed 12344'
# Benchmark grammar-stacks-callbacks vs master with a JSON-schema constraint
# (-j); the '"'"' runs temporarily close the single-quoted string so the
# schema can be passed with its own quoting intact.
hyperfine -L branch grammar-stacks-callbacks,master \
--setup 'git checkout {branch} && make clean && make -j main' \
--warmup 1 --runs 3 \
'branch={branch} \
./main -m ~/AI/Models/phi-2-super.Q5_K_M.gguf \
-j '"'"'{"items": {"type": "number"}, "minItems": 10, "maxItems": 20}'"'"' \
--seed 42 \
-p "JSON array of numbers from 0 to 15"'
# Generate "fast" and "slow" grammars from the same JSON schema on two
# branches, then compare generation speed with each grammar.
# NOTE(review): the generator filename differs between branches
# (json_schema_to_grammar.py vs json-schema-to-grammar.py) — presumably it
# was renamed between master and json-faster-repetitions2; TODO confirm.
echo '{"items": {"type": "number"}, "maxItems": 10000}' > schema.json && \
git checkout json-faster-repetitions2 && \
python examples/json_schema_to_grammar.py schema.json > fast.grammar && \
git checkout master && \
python examples/json-schema-to-grammar.py schema.json > slow.grammar && \
make clean && make -j LLAMA_CURL=1 main && \
mkdir -p models/7B && \
hyperfine --warmup 1 -L speed fast,slow './main -mu https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q4_K_M.gguf --grammar-file {speed}.grammar -p "List of 10 numbers" --seed 1234'
-mu https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/resolve/main/Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf
# Benchmark issue 4218's grammar across three branches; a prompt cache is
# primed in --setup (-n 1) so timed runs mostly measure generation.
# NOTE(review): ${COMMON_ARGS[*]} is expanded by the outer shell before
# hyperfine runs, joining the array with spaces — safe here only because no
# element contains whitespace.
( export COMMON_ARGS=(
-m ~/AI/Models/Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf
--prompt-cache issue4218.bin
--grammar-file issue4218.gbnf
-f issue4218.txt
-c 3400
) && \
hyperfine --warmup 1 --runs 10 \
-L branch grammar-reps-revert,grammar-reps,master \
--setup "\
git checkout {branch} && \
make clean && make -j LLAMA_CURL=1 main && \
rm -f issue4218.bin && \
./main ${COMMON_ARGS[*]} -n 1" \
"BRANCH={branch} \
./main ${COMMON_ARGS[*]} -n 256 --prompt-cache-ro --seed 12345 --no-display-prompt" \
)
# Same issue-4218 benchmark as above, but grammar-fast vs master, with no
# warmup and only 3 runs.
( export COMMON_ARGS=(
-m ~/AI/Models/Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf
--prompt-cache issue4218.bin
--grammar-file issue4218.gbnf
-f issue4218.txt
-c 3400
) && \
hyperfine --warmup 0 --runs 3 \
-L branch grammar-fast,master \
--setup "\
git checkout {branch} && \
make clean && make -j LLAMA_CURL=1 main && \
rm -f issue4218.bin && \
./main ${COMMON_ARGS[*]} -n 1" \
"BRANCH={branch} \
./main ${COMMON_ARGS[*]} -n 256 --prompt-cache-ro --seed 12345 --no-display-prompt" \
)
# Benchmark grammar-fast vs master on a JSON-schema-constrained (-j)
# generation; each branch is rebuilt in --setup.
hyperfine --warmup 1 --runs 3 \
-L branch grammar-fast,master \
--setup 'git checkout {branch} && \
make clean && \
make -j LLAMA_CURL=1 main' \
'BRANCH={branch} \
./main \
-m ~/AI/Models/Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf \
-j '"'"'{"items": {"type": "number"}, "minItems": 10, "maxItems": 100}'"'"' \
-p "JSON list of first 50 integers" \
--seed 12345 --no-display-prompt'
# Ad-hoc build with curl, native and OpenBLAS enabled.
make LLAMA_CURL=1 LLAMA_NATIVE=1 LLAMA_OPENBLAS=1 main
# NOTE(review): the bare token below looks like a jotted-down flag name
# (LLAMA_BLIS), not a runnable command — it would fail if executed.
LLAMA_BLIS
# Quick manual run with q4_0 K-cache quantization (-ctk q4_0).
./main -m ~/AI/Models/Meta-Llama-3-70B-Instruct-Q5_K_M.gguf -p "What can you tell me about the latest quantum physics advances?" -ctk q4_0
# NOTE(review): this section looks like an incomplete/abandoned draft — the
# subshell opened on the next line is never closed, the `modes` array is not
# referenced afterwards, and the first hyperfine invocation below ends
# mid-command (trailing backslash into a new subshell). Kept verbatim.
( export modes=(
"LLAMA_NO_ACCELERATE=1"
"LLAMA_NATIVE=1"
"LLAMA_NATIVE=1 LLAMA_NO_ACCELERATE=1"
"LLAMA_NATIVE=1 LLAMA_OPENBLAS=1"
"LLAMA_NATIVE=1 LLAMA_NO_LLAMAFILE="
"LLAMA_NATIVE=1 LLAMA_NO_LLAMAFILE="
"LLAMA_NATIVE=1 LLAMA_BLIS=1"
)
# Sweep of build-flag combinations; each -L axis multiplies the run matrix.
hyperfine -L mode "LLAMA_OPENBLAS=1 LLAMA_NATIVE=1,LLAMA_NATIVE=1,,LLAMA_NO_LLAMAFILE=,LLAMA_NO_LLAMAFILE= LLAMA_NATIVE=1,LLAMA_BLIS=1 LLAMA_NATIVE=1" \
( export params=(
"LLAMA_ACCELERATE={accelerate}"
"LLAMA_NATIVE={native}"
"LLAMA_OPENBLAS={openblas}"
"LLAMA_BLIS={blis}"
"LLAMA_NO_LLAMAFILE={no_llamafile}"
) && \
hyperfine \
-L accelerate 0,1 \
-L native 0,1 \
-L openblas 0,1 \
-L blis 0,1 \
-L no_llamafile ,1 \
--warmup 1 --runs 5 \
--setup "make clean && make -j LLAMA_CURL=1 ${params[*]} main" \
--export-json perf.json \
--export-markdown perf.md \
"${params[*]} ./main -m ~/AI/Models/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf -n 128 --seed 42 -p 'What can you tell me about the latest quantum physics advances? Do not shy away from the details, tell me everyting one may or may not want to ever know regardless of their background, level and tastes.'" )
- Reps: grammar-reps vs. master
- Deduping: 04a5ac211ef40936295980b7cdf0ba6e97093146 (previous commit = f7001ccc5aa359fcf41bba19d1c99c3d25c9bcc7)
- Reserves: cbaadc92942c50aab599a9e4c163afc1f44f7c26 (previous commit = 1bbdaf6ecda6f0a360dfb307b256fcb6838c560b)
-L branch grammar-reps,master,04a5ac211ef40936295980b7cdf0ba6e97093146,f7001ccc5aa359fcf41bba19d1c99c3d25c9bcc7,cbaadc92942c50aab599a9e4c163afc1f44f7c26,1bbdaf6ecda6f0a360dfb307b256fcb6838c560b \
# Benchmark grammar-reps vs master using a prompt cache so mostly generation
# (not prompt eval) is timed; $MODEL_URL is expanded by the shell hyperfine
# spawns (single quotes defer expansion past the outer shell).
# NOTE(review): PROMPT_FILE is exported but never referenced below (-f is not
# passed) — looks like an oversight; TODO confirm.
( export MODEL_URL=https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/resolve/main/Hermes-2-Pro-Mistral-7B.Q8_0.gguf && \
export PROMPT_FILE=issue4218.txt && \
hyperfine --warmup 1 --runs 10 \
-L branch grammar-reps,master \
--setup 'git checkout {branch} && \
make clean && make -j LLAMA_CURL=1 main && \
rm -f issue4218.bin && \
./main -mu $MODEL_URL \
-c 3400 \
--prompt-cache issue4218.bin \
-n 1' \
'BRANCH={branch} ./main -mu $MODEL_URL \
-c 3400 \
--prompt-cache issue4218.bin \
--prompt-cache-ro \
--no-display-prompt \
--seed 12345 \
-n 256' \
)
-m /Users/ochafik/AI/Models/Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf
@HanClinto I did a first bit of benchmarking / comparison of outputs on @AlienKevin's grammar from https://github.com/ggerganov/llama.cpp/issues/4218#issuecomment-1836540046 on this PR and it seems to be ~~8%~~ (**edit**) 5% slower than master on my Mac.
Here are the instructions (download [issue4218.gbnf](https://gist.githubusercontent.com/ochafik/e5b1731e4e574ac1aed2e49e5f9c5899/raw/c0c1411377500f6e0d66ec7dbfea22686178986c/issue4218.gbnf) and [issue4218.txt](https://gist.githubusercontent.com/ochafik/e5b1731e4e574ac1aed2e49e5f9c5899/raw/c0c1411377500f6e0d66ec7dbfea22686178986c/issue4218.txt)):
```bash
( export COMMON_ARGS=(
-mu https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/resolve/main/Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf
--prompt-cache issue4218.bin
--grammar-file issue4218.gbnf
-f issue4218.txt
-c 3400
) && \
hyperfine --warmup 1 --runs 10 \
-L branch grammar-reps,master \
--setup "\
git checkout {branch} && \
make clean && make -j LLAMA_CURL=1 main && \
rm -f issue4218.bin && \
./main ${COMMON_ARGS[*]} -n 1" \
"BRANCH={branch} \
./main ${COMMON_ARGS[*]} -n 256 --prompt-cache-ro --seed 12345 --no-display-prompt" \
)
```
(used a prompt cache to benchmark mostly the generation itself and not the long prompt eval; grammar, prompt and script in [this gist](https://gist.github.com/ochafik/e5b1731e4e574ac1aed2e49e5f9c5899)).
(Interestingly, that same grammar makes master segfault with sequences that are a bit too long; will probably investigate / report separately — **edit**: seems to be caused by `-ctk q4_k` regardless of grammar --> https://github.com/ggerganov/llama.cpp/issues/5652)
# Sweep openblas / no-llamafile / tcmalloc build-flag combinations; an empty
# value in a -L list (e.g. "1,") means the flag is left unset for that run.
hyperfine --warmup 0 --runs 3 -L openblas 1, -L nollamafile ,1 -L tcmalloc 1, \
--setup 'make clean && make LLAMA_CURL=1 LLAMA_OPENBLAS={openblas} LLAMA_NO_LLAMAFILE={nollamafile} LLAMA_TCMALLOC={tcmalloc} -j main' 'ENV="openblas={openblas} nollamafile={nollamafile} tcmalloc={tcmalloc}" ./main -m ~/AI/Models/Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf --grammar-file issue4218.gbnf -f issue4218.txt -c 3400 -n 128 --seed 12345 --no-display-prompt --log-disable'
# Benchmark grammar-fast vs master on another JSON-schema-constrained prompt
# (model path under models/ this time).
hyperfine --warmup 1 --runs 5 \
-L branch grammar-fast,master \
--setup 'git checkout {branch} && \
make clean && \
make -j LLAMA_CURL=1 main' \
'BRANCH={branch} \
./main \
-m models/Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf \
-j '"'"'{"items": {"type": "number"}, "minItems": 10, "maxItems": 100}'"'"' \
-p "JSON list of 50 integers starting from 100000" \
--seed 12345 --no-display-prompt'
# Compare build systems end-to-end via the helper scripts in this gist:
# b.sh builds for {tool}, r.sh runs the matching binary.
# NOTE(review): the bare `-p \` before `-f` below looks accidental (the next
# flag would be consumed as -p's argument) — TODO confirm.
hyperfine --warmup 0 --runs 3 -L tool cmake-opt,cmake,make,make-openblas \
--setup './b.sh {tool}' \
'./r.sh {tool} \
-mu https://huggingface.co/MoMonir/Phi-3-mini-128k-instruct-GGUF/resolve/main/phi-3-mini-128k-instruct.Q8_0.gguf -p \
-f /Users/ochafik/github/ai/indexer/paul_graham_essay.txt \
-c 131072 -n 1 --seed 12345 --no-display-prompt --log-disable'
# Prepare a 100-line prompt file from the essay.
cat /Users/ochafik/github/ai/indexer/paul_graham_essay.txt | head -n 100 > input.txt
# Another issue-4218 benchmark (grammar-fast vs master, -n 128).
# NOTE(review): both -mu (download URL) and -m (local path) are present in
# COMMON_ARGS — presumably the later -m takes precedence; TODO confirm.
( export COMMON_ARGS=(
-mu https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/resolve/main/Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf
-m models/Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf
--prompt-cache issue4218.bin
--grammar-file issue4218.gbnf
-f issue4218.txt
-c 3400
) && \
hyperfine --warmup 1 --runs 5 \
-L branch grammar-fast,master \
--setup "\
git checkout {branch} && \
make clean && make -j LLAMA_CURL=1 main && \
rm -f issue4218.bin && \
./main ${COMMON_ARGS[*]} -n 1" \
"BRANCH={branch} \
./main ${COMMON_ARGS[*]} -n 128 --prompt-cache-ro --seed 12345 --no-display-prompt" )
# Full build-system matrix (cmake/make x lto x nonative x openblas) via the
# b.sh / r.sh helpers.
hyperfine --warmup 0 --runs 3 -L tool cmake-nonative,cmake-lto,cmake-lto-nonative,cmake,make,make-nonative,make-openblas,make-openblas-nonative \
--setup './b.sh {tool}' \
'./r.sh {tool} \
-mu https://huggingface.co/MoMonir/Phi-3-mini-128k-instruct-GGUF/resolve/main/phi-3-mini-128k-instruct.Q8_0.gguf \
-f input.txt \
-c 131072 -n 100 --seed 12345 --no-display-prompt --log-disable'
# Regenerate the 100-line prompt file (relative paths this time).
cat ../ai/indexer/paul_graham_essay.txt| head -n 100 > ../input.txt
# Benchmark the flash-attention branch vs master with a q4_0 K-cache.
hyperfine --warmup 1 --runs 3 -L branch gg/flash-attn,master \
--setup 'git checkout {branch} && make clean && make -j LLAMA_CURL=1 main' \
'branch={branch} ./main \
-m models/Meta-Llama-3-70B-Instruct-Q4_K_M.gguf \
-f ../input.txt \
-c 131072 -n 100 --seed 12345 --no-display-prompt --log-disable -ctk q4_0'
#!/bin/bash
# Run helper: dispatches to the binary produced by b.sh for the given tool
# (make builds put `main` at the repo root, cmake builds under build/bin).
# Usage: ./r.sh <tool> [main-args...]
set -euo pipefail

if [[ $# -lt 1 ]]; then
  echo "Usage: $0 <tool> [args...]" >&2
  exit 1
fi
tool=$1
shift 1

case "$tool" in
  make*)
    ./main "$@"
    ;;
  cmake*)
    ./build/bin/main "$@"
    ;;
  *)
    echo "Unsupported tool: $tool" >&2
    exit 1
    ;;
esac
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment