Skip to content

Instantly share code, notes, and snippets.

@ochafik

ochafik/b.sh Secret

Last active April 28, 2024 17:28
Show Gist options
  • Select an option

  • Save ochafik/5ddc619e1168ec6afbfafd50e1301d21 to your computer and use it in GitHub Desktop.

Select an option

Save ochafik/5ddc619e1168ec6afbfafd50e1301d21 to your computer and use it in GitHub Desktop.
llama.cpp hyperfine commands
#!/bin/bash
# Build helper: builds llama.cpp's `main` target with either GNU make or
# CMake, deriving extra build flags from substrings of the tool name
# (e.g. "cmake-lto-nonative" -> LLAMA_LTO=1, LLAMA_NATIVE=0).
# Usage: ./b.sh <tool>   where <tool> starts with "make" or "cmake".
set -euo pipefail

make_args=()
cmake_args=()
# Flags used here in past experiments, kept for reference:
# -DCMAKE_OSX_ARCHITECTURES=arm64
# -DCMAKE_BUILD_TYPE=Release
# -DCMAKE_OSX_ARCHITECTURES=x86_64;arm64

# set_flag NAME=VALUE — record a flag for both build systems:
# make gets it verbatim, cmake gets it as a -D define.
function set_flag() {
  make_args+=( "$1" )    # quoted so the flag stays a single word
  cmake_args+=( "-D$1" )
}

set_flag LLAMA_CURL=1

if [[ $# -lt 1 ]]; then
  echo "Usage: $0 <tool>" >&2
  exit 1
fi
tool=$1

# Feature toggles selected by substrings of the tool name.
if [[ $tool == *-openblas* ]]; then
  set_flag LLAMA_OPENBLAS=1
fi
if [[ $tool == *-lto* ]]; then
  set_flag LLAMA_LTO=1
fi
if [[ $tool == *-nonative* ]]; then
  set_flag LLAMA_NATIVE=0
fi

case "$tool" in
  make*)
    make clean
    make -j "${make_args[@]}" main
    ;;
  cmake*)
    rm -fR build
    cmake . -B build "${cmake_args[@]}"
    cmake --build build -j -t main
    ;;
  *)
    echo "Unsupported tool: $tool" >&2
    exit 1
    ;;
esac
# Benchmark branch grammar-speedup3 vs master: each branch is rebuilt in
# --setup, then ./main generates against a JSON-numbers grammar (model is
# fetched/cached by URL via -mu).
hyperfine \
--warmup 1 --runs 5 \
-L branch grammar-speedup3,master \
--setup 'git checkout {branch} && make clean && make -j LLAMA_CURL=1 main' \
'./main \
-mu https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q4_K_M.gguf \
--grammar-file json_numbers.grammar \
-p "List of 20 integers starting from 0" \
--seed 12344'
# Benchmark the agent example (requires the `server` target) on branch
# agent-example vs a pinned commit.
hyperfine \
--warmup 1 --runs 5 \
-L branch agent-example,a474f50ebb3e10be3371562f75f3f573f1a86b5f \
--setup 'git checkout {branch} && make clean && make -j LLAMA_CURL=1 server' \
'python -m examples.agent \
--model ~/AI/Models/Hermes-2-Pro-Mistral-7B.Q8_0.gguf \
--tools examples/agent/tools/example_math_tools.py \
--greedy \
--goal "What is the sum of 2535 squared and 32222000403 then multiplied by one and a half. What is a third of the result?" \
--seed 12344'
# Benchmark grammar-stacks-callbacks vs master with a JSON-schema constraint
# (-j); the '"'"' runs temporarily close the single-quoted string so the
# schema can be passed with its own quoting intact.
hyperfine -L branch grammar-stacks-callbacks,master \
--setup 'git checkout {branch} && make clean && make -j main' \
--warmup 1 --runs 3 \
'branch={branch} \
./main -m ~/AI/Models/phi-2-super.Q5_K_M.gguf \
-j '"'"'{"items": {"type": "number"}, "minItems": 10, "maxItems": 20}'"'"' \
--seed 42 \
-p "JSON array of numbers from 0 to 15"'
# Generate "fast" and "slow" grammars from the same JSON schema on two
# branches, then compare generation speed with each grammar.
# NOTE(review): the generator filename differs between branches
# (json_schema_to_grammar.py vs json-schema-to-grammar.py) — presumably it
# was renamed between master and json-faster-repetitions2; TODO confirm.
echo '{"items": {"type": "number"}, "maxItems": 10000}' > schema.json && \
git checkout json-faster-repetitions2 && \
python examples/json_schema_to_grammar.py schema.json > fast.grammar && \
git checkout master && \
python examples/json-schema-to-grammar.py schema.json > slow.grammar && \
make clean && make -j LLAMA_CURL=1 main && \
mkdir -p models/7B && \
hyperfine --warmup 1 -L speed fast,slow './main -mu https://huggingface.co/TheBloke/phi-2-GGUF/resolve/main/phi-2.Q4_K_M.gguf --grammar-file {speed}.grammar -p "List of 10 numbers" --seed 1234'
-mu https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/resolve/main/Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf
# Benchmark issue 4218's grammar across three branches; a prompt cache is
# primed in --setup (-n 1) so timed runs mostly measure generation.
# NOTE(review): ${COMMON_ARGS[*]} is expanded by the outer shell before
# hyperfine runs, joining the array with spaces — safe here only because no
# element contains whitespace.
( export COMMON_ARGS=(
-m ~/AI/Models/Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf
--prompt-cache issue4218.bin
--grammar-file issue4218.gbnf
-f issue4218.txt
-c 3400
) && \
hyperfine --warmup 1 --runs 10 \
-L branch grammar-reps-revert,grammar-reps,master \
--setup "\
git checkout {branch} && \
make clean && make -j LLAMA_CURL=1 main && \
rm -f issue4218.bin && \
./main ${COMMON_ARGS[*]} -n 1" \
"BRANCH={branch} \
./main ${COMMON_ARGS[*]} -n 256 --prompt-cache-ro --seed 12345 --no-display-prompt" \
)
# Same issue-4218 benchmark as above, but grammar-fast vs master, with no
# warmup and only 3 runs.
( export COMMON_ARGS=(
-m ~/AI/Models/Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf
--prompt-cache issue4218.bin
--grammar-file issue4218.gbnf
-f issue4218.txt
-c 3400
) && \
hyperfine --warmup 0 --runs 3 \
-L branch grammar-fast,master \
--setup "\
git checkout {branch} && \
make clean && make -j LLAMA_CURL=1 main && \
rm -f issue4218.bin && \
./main ${COMMON_ARGS[*]} -n 1" \
"BRANCH={branch} \
./main ${COMMON_ARGS[*]} -n 256 --prompt-cache-ro --seed 12345 --no-display-prompt" \
)
# Benchmark grammar-fast vs master on a JSON-schema-constrained (-j)
# generation; each branch is rebuilt in --setup.
hyperfine --warmup 1 --runs 3 \
-L branch grammar-fast,master \
--setup 'git checkout {branch} && \
make clean && \
make -j LLAMA_CURL=1 main' \
'BRANCH={branch} \
./main \
-m ~/AI/Models/Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf \
-j '"'"'{"items": {"type": "number"}, "minItems": 10, "maxItems": 100}'"'"' \
-p "JSON list of first 50 integers" \
--seed 12345 --no-display-prompt'
# Ad-hoc build with curl, native and OpenBLAS enabled.
make LLAMA_CURL=1 LLAMA_NATIVE=1 LLAMA_OPENBLAS=1 main
# NOTE(review): the bare token below looks like a jotted-down flag name
# (LLAMA_BLIS), not a runnable command — it would fail if executed.
LLAMA_BLIS
# Quick manual run with q4_0 K-cache quantization (-ctk q4_0).
./main -m ~/AI/Models/Meta-Llama-3-70B-Instruct-Q5_K_M.gguf -p "What can you tell me about the latest quantum physics advances?" -ctk q4_0
# NOTE(review): this section looks like an incomplete/abandoned draft — the
# subshell opened on the next line is never closed, the `modes` array is not
# referenced afterwards, and the first hyperfine invocation below ends
# mid-command (trailing backslash into a new subshell). Kept verbatim.
( export modes=(
"LLAMA_NO_ACCELERATE=1"
"LLAMA_NATIVE=1"
"LLAMA_NATIVE=1 LLAMA_NO_ACCELERATE=1"
"LLAMA_NATIVE=1 LLAMA_OPENBLAS=1"
"LLAMA_NATIVE=1 LLAMA_NO_LLAMAFILE="
"LLAMA_NATIVE=1 LLAMA_NO_LLAMAFILE="
"LLAMA_NATIVE=1 LLAMA_BLIS=1"
)
# Sweep of build-flag combinations; each -L axis multiplies the run matrix.
hyperfine -L mode "LLAMA_OPENBLAS=1 LLAMA_NATIVE=1,LLAMA_NATIVE=1,,LLAMA_NO_LLAMAFILE=,LLAMA_NO_LLAMAFILE= LLAMA_NATIVE=1,LLAMA_BLIS=1 LLAMA_NATIVE=1" \
( export params=(
"LLAMA_ACCELERATE={accelerate}"
"LLAMA_NATIVE={native}"
"LLAMA_OPENBLAS={openblas}"
"LLAMA_BLIS={blis}"
"LLAMA_NO_LLAMAFILE={no_llamafile}"
) && \
hyperfine \
-L accelerate 0,1 \
-L native 0,1 \
-L openblas 0,1 \
-L blis 0,1 \
-L no_llamafile ,1 \
--warmup 1 --runs 5 \
--setup "make clean && make -j LLAMA_CURL=1 ${params[*]} main" \
--export-json perf.json \
--export-markdown perf.md \
"${params[*]} ./main -m ~/AI/Models/Meta-Llama-3-8B-Instruct-Q4_K_M.gguf -n 128 --seed 42 -p 'What can you tell me about the latest quantum physics advances? Do not shy away from the details, tell me everyting one may or may not want to ever know regardless of their background, level and tastes.'" )
- Reps: grammar-reps vs. master
- Deduping: 04a5ac211ef40936295980b7cdf0ba6e97093146 (previous commit = f7001ccc5aa359fcf41bba19d1c99c3d25c9bcc7)
- Reserves: cbaadc92942c50aab599a9e4c163afc1f44f7c26 (previous commit = 1bbdaf6ecda6f0a360dfb307b256fcb6838c560b)
-L branch grammar-reps,master,04a5ac211ef40936295980b7cdf0ba6e97093146,f7001ccc5aa359fcf41bba19d1c99c3d25c9bcc7,cbaadc92942c50aab599a9e4c163afc1f44f7c26,1bbdaf6ecda6f0a360dfb307b256fcb6838c560b \
# Benchmark grammar-reps vs master using a prompt cache so mostly generation
# (not prompt eval) is timed; $MODEL_URL is expanded by the shell hyperfine
# spawns (single quotes defer expansion past the outer shell).
# NOTE(review): PROMPT_FILE is exported but never referenced below (-f is not
# passed) — looks like an oversight; TODO confirm.
( export MODEL_URL=https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/resolve/main/Hermes-2-Pro-Mistral-7B.Q8_0.gguf && \
export PROMPT_FILE=issue4218.txt && \
hyperfine --warmup 1 --runs 10 \
-L branch grammar-reps,master \
--setup 'git checkout {branch} && \
make clean && make -j LLAMA_CURL=1 main && \
rm -f issue4218.bin && \
./main -mu $MODEL_URL \
-c 3400 \
--prompt-cache issue4218.bin \
-n 1' \
'BRANCH={branch} ./main -mu $MODEL_URL \
-c 3400 \
--prompt-cache issue4218.bin \
--prompt-cache-ro \
--no-display-prompt \
--seed 12345 \
-n 256' \
)
-m /Users/ochafik/AI/Models/Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf
@HanClinto I did a first bit of benchmarking / comparison of outputs on @AlienKevin's grammar from https://github.com/ggerganov/llama.cpp/issues/4218#issuecomment-1836540046 on this PR and it seems to be ~~8%~~ (**edit**) 5% slower than master on my Mac.
Here are the instructions (download [issue4218.gbnf](https://gist.githubusercontent.com/ochafik/e5b1731e4e574ac1aed2e49e5f9c5899/raw/c0c1411377500f6e0d66ec7dbfea22686178986c/issue4218.gbnf) and [issue4218.txt](https://gist.githubusercontent.com/ochafik/e5b1731e4e574ac1aed2e49e5f9c5899/raw/c0c1411377500f6e0d66ec7dbfea22686178986c/issue4218.txt)):
```bash
( export COMMON_ARGS=(
-mu https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/resolve/main/Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf
--prompt-cache issue4218.bin
--grammar-file issue4218.gbnf
-f issue4218.txt
-c 3400
) && \
hyperfine --warmup 1 --runs 10 \
-L branch grammar-reps,master \
--setup "\
git checkout {branch} && \
make clean && make -j LLAMA_CURL=1 main && \
rm -f issue4218.bin && \
./main ${COMMON_ARGS[*]} -n 1" \
"BRANCH={branch} \
./main ${COMMON_ARGS[*]} -n 256 --prompt-cache-ro --seed 12345 --no-display-prompt" \
)
```
(used a prompt cache to benchmark mostly the generation itself and not the long prompt eval; grammar, prompt and script in [this gist](https://gist.github.com/ochafik/e5b1731e4e574ac1aed2e49e5f9c5899)).
(Interestingly, that same grammar makes master segfault with sequences that are a bit too long; will probably investigate / report separately — **edit**: seems to be caused by `-ctk q4_k` regardless of grammar --> https://github.com/ggerganov/llama.cpp/issues/5652)
# Sweep openblas / no-llamafile / tcmalloc build-flag combinations; an empty
# value in a -L list (e.g. "1,") means the flag is left unset for that run.
hyperfine --warmup 0 --runs 3 -L openblas 1, -L nollamafile ,1 -L tcmalloc 1, \
--setup 'make clean && make LLAMA_CURL=1 LLAMA_OPENBLAS={openblas} LLAMA_NO_LLAMAFILE={nollamafile} LLAMA_TCMALLOC={tcmalloc} -j main' 'ENV="openblas={openblas} nollamafile={nollamafile} tcmalloc={tcmalloc}" ./main -m ~/AI/Models/Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf --grammar-file issue4218.gbnf -f issue4218.txt -c 3400 -n 128 --seed 12345 --no-display-prompt --log-disable'
# Benchmark grammar-fast vs master on another JSON-schema-constrained prompt
# (model path under models/ this time).
hyperfine --warmup 1 --runs 5 \
-L branch grammar-fast,master \
--setup 'git checkout {branch} && \
make clean && \
make -j LLAMA_CURL=1 main' \
'BRANCH={branch} \
./main \
-m models/Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf \
-j '"'"'{"items": {"type": "number"}, "minItems": 10, "maxItems": 100}'"'"' \
-p "JSON list of 50 integers starting from 100000" \
--seed 12345 --no-display-prompt'
# Compare build systems end-to-end via the helper scripts in this gist:
# b.sh builds for {tool}, r.sh runs the matching binary.
# NOTE(review): the bare `-p \` before `-f` below looks accidental (the next
# flag would be consumed as -p's argument) — TODO confirm.
hyperfine --warmup 0 --runs 3 -L tool cmake-opt,cmake,make,make-openblas \
--setup './b.sh {tool}' \
'./r.sh {tool} \
-mu https://huggingface.co/MoMonir/Phi-3-mini-128k-instruct-GGUF/resolve/main/phi-3-mini-128k-instruct.Q8_0.gguf -p \
-f /Users/ochafik/github/ai/indexer/paul_graham_essay.txt \
-c 131072 -n 1 --seed 12345 --no-display-prompt --log-disable'
# Prepare a 100-line prompt file from the essay.
cat /Users/ochafik/github/ai/indexer/paul_graham_essay.txt | head -n 100 > input.txt
# Another issue-4218 benchmark (grammar-fast vs master, -n 128).
# NOTE(review): both -mu (download URL) and -m (local path) are present in
# COMMON_ARGS — presumably the later -m takes precedence; TODO confirm.
( export COMMON_ARGS=(
-mu https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/resolve/main/Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf
-m models/Hermes-2-Pro-Mistral-7B.Q4_K_M.gguf
--prompt-cache issue4218.bin
--grammar-file issue4218.gbnf
-f issue4218.txt
-c 3400
) && \
hyperfine --warmup 1 --runs 5 \
-L branch grammar-fast,master \
--setup "\
git checkout {branch} && \
make clean && make -j LLAMA_CURL=1 main && \
rm -f issue4218.bin && \
./main ${COMMON_ARGS[*]} -n 1" \
"BRANCH={branch} \
./main ${COMMON_ARGS[*]} -n 128 --prompt-cache-ro --seed 12345 --no-display-prompt" )
# Full build-system matrix (cmake/make x lto x nonative x openblas) via the
# b.sh / r.sh helpers.
hyperfine --warmup 0 --runs 3 -L tool cmake-nonative,cmake-lto,cmake-lto-nonative,cmake,make,make-nonative,make-openblas,make-openblas-nonative \
--setup './b.sh {tool}' \
'./r.sh {tool} \
-mu https://huggingface.co/MoMonir/Phi-3-mini-128k-instruct-GGUF/resolve/main/phi-3-mini-128k-instruct.Q8_0.gguf \
-f input.txt \
-c 131072 -n 100 --seed 12345 --no-display-prompt --log-disable'
# Regenerate the 100-line prompt file (relative paths this time).
cat ../ai/indexer/paul_graham_essay.txt| head -n 100 > ../input.txt
# Benchmark the flash-attention branch vs master with a q4_0 K-cache.
hyperfine --warmup 1 --runs 3 -L branch gg/flash-attn,master \
--setup 'git checkout {branch} && make clean && make -j LLAMA_CURL=1 main' \
'branch={branch} ./main \
-m models/Meta-Llama-3-70B-Instruct-Q4_K_M.gguf \
-f ../input.txt \
-c 131072 -n 100 --seed 12345 --no-display-prompt --log-disable -ctk q4_0'
#!/bin/bash
# Run helper: dispatches to the binary produced by b.sh for the given tool
# (make builds put `main` at the repo root, cmake builds under build/bin).
# Usage: ./r.sh <tool> [main-args...]
set -euo pipefail

if [[ $# -lt 1 ]]; then
  echo "Usage: $0 <tool> [args...]" >&2
  exit 1
fi
tool=$1
shift 1

case "$tool" in
  make*)
    ./main "$@"
    ;;
  cmake*)
    ./build/bin/main "$@"
    ;;
  *)
    echo "Unsupported tool: $tool" >&2
    exit 1
    ;;
esac
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment