gengwg@gengwg-mbp:~$ git clone https://github.com/ggerganov/llama.cpp.git
Cloning into 'llama.cpp'...
remote: Enumerating objects: 5267, done.
remote: Counting objects: 100% (2065/2065), done.
remote: Compressing objects: 100% (320/320), done.
remote: Total 5267 (delta 1878), reused 1870 (delta 1745), pack-reused 3202
Receiving objects: 100% (5267/5267), 4.24 MiB | 13.48 MiB/s, done.
Resolving deltas: 100% (3604/3604), done.
gengwg@gengwg-mbp:~$ cd llama.cpp/
gengwg@gengwg-mbp:~/llama.cpp$ LLAMA_METAL=1 make
I llama.cpp build info:
I UNAME_S: Darwin
I UNAME_P: arm
I UNAME_M: arm64
I CFLAGS: -I. -O3 -std=c11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -pthread -DGGML_USE_K_QUANTS -DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_METAL_NDEBUG
I CXXFLAGS: -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS -DGGML_USE_METAL
I LDFLAGS: -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
I CC: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1)
cc -I. -O3 -std=c11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -pthread -DGGML_USE_K_QUANTS -DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_METAL_NDEBUG -c ggml.c -o ggml.o
c++ -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS -DGGML_USE_METAL -c llama.cpp -o llama.o
c++ -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS -DGGML_USE_METAL -c examples/common.cpp -o common.o
cc -I. -O3 -std=c11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -pthread -DGGML_USE_K_QUANTS -DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_METAL_NDEBUG -c -o k_quants.o k_quants.c
cc -I. -O3 -std=c11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -pthread -DGGML_USE_K_QUANTS -DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_METAL_NDEBUG -c ggml-metal.m -o ggml-metal.o
c++ -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS -DGGML_USE_METAL examples/main/main.cpp ggml.o llama.o common.o k_quants.o ggml-metal.o -o main -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
==== Run ./main -h for help. ====
c++ -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS -DGGML_USE_METAL examples/quantize/quantize.cpp ggml.o llama.o k_quants.o ggml-metal.o -o quantize -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
c++ -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS -DGGML_USE_METAL examples/quantize-stats/quantize-stats.cpp ggml.o llama.o k_quants.o ggml-metal.o -o quantize-stats -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
c++ -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS -DGGML_USE_METAL examples/perplexity/perplexity.cpp ggml.o llama.o common.o k_quants.o ggml-metal.o -o perplexity -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
c++ -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS -DGGML_USE_METAL examples/embedding/embedding.cpp ggml.o llama.o common.o k_quants.o ggml-metal.o -o embedding -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
c++ -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS -DGGML_USE_METAL pocs/vdot/vdot.cpp ggml.o k_quants.o ggml-metal.o -o vdot -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
c++ -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS -DGGML_USE_METAL examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o k_quants.o ggml-metal.o -o train-text-from-scratch -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
c++ -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS -DGGML_USE_METAL examples/simple/simple.cpp ggml.o llama.o common.o k_quants.o ggml-metal.o -o simple -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
c++ -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS -DGGML_USE_METAL -Iexamples/server examples/server/server.cpp ggml.o llama.o common.o k_quants.o ggml-metal.o -o server -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
c++ --shared -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS -DGGML_USE_METAL examples/embd-input/embd-input-lib.cpp ggml.o llama.o common.o k_quants.o ggml-metal.o -o libembdinput.so -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
c++ -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS -DGGML_USE_METAL examples/embd-input/embd-input-test.cpp ggml.o llama.o common.o k_quants.o ggml-metal.o -o embd-input-test -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders -L. -lembdinput
gengwg@gengwg-mbp:~/llama.cpp$ export MODEL=llama-2-13b-chat.ggmlv3.q4_0.bin
gengwg@gengwg-mbp:~/llama.cpp$ wget "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/${MODEL}"
--2023-07-20 16:50:29-- https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/llama-2-13b-chat.ggmlv3.q4_0.bin
Resolving huggingface.co (huggingface.co)... 2600:9000:234c:d800:17:b174:6d00:93a1, 2600:9000:234c:cc00:17:b174:6d00:93a1, 2600:9000:234c:8200:17:b174:6d00:93a1, ...
Connecting to huggingface.co (huggingface.co)|2600:9000:234c:d800:17:b174:6d00:93a1|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/repos/cd/43/cd4356b11767f5136b31b27dbb8863d6dd69a4010e034ef75be9c2c12fcd10f7/f79142715bc9539a2edbb4b253548db8b34fac22736593eeaa28555874476e30?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27llama-2-13b-chat.ggmlv3.q4_0.bin%3B+filename%3D%22llama-2-13b-chat.ggmlv3.q4_0.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1690145884&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5MDE0NTg4NH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy9jZC80My9jZDQzNTZiMTE3NjdmNTEzNmIzMWIyN2RiYjg4NjNkNmRkNjlhNDAxMGUwMzRlZjc1YmU5YzJjMTJmY2QxMGY3L2Y3OTE0MjcxNWJjOTUzOWEyZWRiYjRiMjUzNTQ4ZGI4YjM0ZmFjMjI3MzY1OTNlZWFhMjg1NTU4NzQ0NzZlMzA%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qJnJlc3BvbnNlLWNvbnRlbnQtdHlwZT0qIn1dfQ__&Signature=fNW7W3iJcgv3RrLctLCegcwW8FEXzwkDd95bC2Mfw1xoNuXnn3iUJCczhBlVYVpKD-9Du51emFC6khrd9G%7EyXSqfa8pTBRoEHq%7Ei8bysOH2FHA1HZ4fWYevwFJOSDVdtkemYbIls6-zL8SHL39At8tE8gejlwmW7025O5knEH-zqAONqaXgXqZjipPUnhAurFfDQkI96zFhjU5HVWi-AwRLHPfWzXju5KeT29zfIylk6FxB72XJgQCQQvukJeFBuvOTaM7aZZO3r0NfaWS9607w%7EF5KUYTslfE7%7ETD3hqNMb%7EZjt8HFQRVYJvE7bSgl%7EvjoC7dJ-wIMiCXzvcXHIxQ__&Key-Pair-Id=KVTP0A1DKRTAX [following]
--2023-07-20 16:50:29-- https://cdn-lfs.huggingface.co/repos/cd/43/cd4356b11767f5136b31b27dbb8863d6dd69a4010e034ef75be9c2c12fcd10f7/f79142715bc9539a2edbb4b253548db8b34fac22736593eeaa28555874476e30?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27llama-2-13b-chat.ggmlv3.q4_0.bin%3B+filename%3D%22llama-2-13b-chat.ggmlv3.q4_0.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1690145884&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTY5MDE0NTg4NH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy9jZC80My9jZDQzNTZiMTE3NjdmNTEzNmIzMWIyN2RiYjg4NjNkNmRkNjlhNDAxMGUwMzRlZjc1YmU5YzJjMTJmY2QxMGY3L2Y3OTE0MjcxNWJjOTUzOWEyZWRiYjRiMjUzNTQ4ZGI4YjM0ZmFjMjI3MzY1OTNlZWFhMjg1NTU4NzQ0NzZlMzA%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qJnJlc3BvbnNlLWNvbnRlbnQtdHlwZT0qIn1dfQ__&Signature=fNW7W3iJcgv3RrLctLCegcwW8FEXzwkDd95bC2Mfw1xoNuXnn3iUJCczhBlVYVpKD-9Du51emFC6khrd9G%7EyXSqfa8pTBRoEHq%7Ei8bysOH2FHA1HZ4fWYevwFJOSDVdtkemYbIls6-zL8SHL39At8tE8gejlwmW7025O5knEH-zqAONqaXgXqZjipPUnhAurFfDQkI96zFhjU5HVWi-AwRLHPfWzXju5KeT29zfIylk6FxB72XJgQCQQvukJeFBuvOTaM7aZZO3r0NfaWS9607w%7EF5KUYTslfE7%7ETD3hqNMb%7EZjt8HFQRVYJvE7bSgl%7EvjoC7dJ-wIMiCXzvcXHIxQ__&Key-Pair-Id=KVTP0A1DKRTAX
Resolving cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)... 2600:9000:25f1:6e00:11:f807:5180:93a1, 2600:9000:25f1:e400:11:f807:5180:93a1, 2600:9000:25f1:c00:11:f807:5180:93a1, ...
Connecting to cdn-lfs.huggingface.co (cdn-lfs.huggingface.co)|2600:9000:25f1:6e00:11:f807:5180:93a1|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7323305088 (6.8G) [application/octet-stream]
Saving to: ‘llama-2-13b-chat.ggmlv3.q4_0.bin’
llama-2-13b-chat.ggmlv3.q4_0.bin 100%[====================================================================================================================================================================================================================================================>] 6.82G 49.1MB/s in 2m 57s
2023-07-20 16:53:27 (39.4 MB/s) - ‘llama-2-13b-chat.ggmlv3.q4_0.bin’ saved [7323305088/7323305088]
gengwg@gengwg-mbp:~/llama.cpp$ echo "Prompt: " \
> && read PROMPT \
> && ./main \
> -t 8 \
> -ngl 1 \
> -m ${MODEL} \
> --color \
> -c 2048 \
> --temp 0.7 \
> --repeat_penalty 1.1 \
> -n -1 \
> -p "[INST] ${PROMPT} [/INST] "
Prompt:
Hello Llama!
main: build = 856 (e782c9e)
main: seed = 1689897236
llama.cpp: loading model from llama-2-13b-chat.ggmlv3.q4_0.bin
llama_model_load_internal: format = ggjt v3 (latest)
llama_model_load_internal: n_vocab = 32000
llama_model_load_internal: n_ctx = 2048
llama_model_load_internal: n_embd = 5120
llama_model_load_internal: n_mult = 256
llama_model_load_internal: n_head = 40
llama_model_load_internal: n_layer = 40
llama_model_load_internal: n_rot = 128
llama_model_load_internal: freq_base = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype = 2 (mostly Q4_0)
llama_model_load_internal: n_ff = 13824
llama_model_load_internal: model size = 13B
llama_model_load_internal: ggml ctx size = 0.09 MB
llama_model_load_internal: mem required = 8953.71 MB (+ 1608.00 MB per state)
llama_new_context_with_model: kv self size = 1600.00 MB
ggml_metal_init: allocating
ggml_metal_init: using MPS
ggml_metal_init: loading '/Users/gengwg/llama.cpp/ggml-metal.metal'
ggml_metal_init: loaded kernel_add 0x136f05780
ggml_metal_init: loaded kernel_mul 0x136f060a0
ggml_metal_init: loaded kernel_mul_row 0x136e08430
ggml_metal_init: loaded kernel_scale 0x136e08a60
ggml_metal_init: loaded kernel_silu 0x136e08f80
ggml_metal_init: loaded kernel_relu 0x136f064a0
ggml_metal_init: loaded kernel_gelu 0x136f06ae0
ggml_metal_init: loaded kernel_soft_max 0x136e096c0
ggml_metal_init: loaded kernel_diag_mask_inf 0x136e09e60
ggml_metal_init: loaded kernel_get_rows_f16 0x11b406ae0
ggml_metal_init: loaded kernel_get_rows_q4_0 0x136e0a3e0
ggml_metal_init: loaded kernel_get_rows_q4_1 0x136e0ae90
ggml_metal_init: loaded kernel_get_rows_q2_K 0x136e0b530
ggml_metal_init: loaded kernel_get_rows_q3_K 0x136e0bbd0
ggml_metal_init: loaded kernel_get_rows_q4_K 0x136e0c270
ggml_metal_init: loaded kernel_get_rows_q5_K 0x136e0c910
ggml_metal_init: loaded kernel_get_rows_q6_K 0x136e0cfb0
ggml_metal_init: loaded kernel_rms_norm 0x136f07240
ggml_metal_init: loaded kernel_norm 0x136e0d6f0
ggml_metal_init: loaded kernel_mul_mat_f16_f32 0x136e0e420
ggml_metal_init: loaded kernel_mul_mat_q4_0_f32 0x136e0e7a0
ggml_metal_init: loaded kernel_mul_mat_q4_1_f32 0x136e0ee80
ggml_metal_init: loaded kernel_mul_mat_q2_K_f32 0x136f07960
ggml_metal_init: loaded kernel_mul_mat_q3_K_f32 0x136e0f970
ggml_metal_init: loaded kernel_mul_mat_q4_K_f32 0x136e10050
ggml_metal_init: loaded kernel_mul_mat_q5_K_f32 0x136e10690
ggml_metal_init: loaded kernel_mul_mat_q6_K_f32 0x136f08020
ggml_metal_init: loaded kernel_rope 0x136e10bb0
ggml_metal_init: loaded kernel_alibi_f32 0x136e11590
ggml_metal_init: loaded kernel_cpy_f32_f16 0x136e12450
ggml_metal_init: loaded kernel_cpy_f32_f32 0x136e12d00
ggml_metal_init: loaded kernel_cpy_f16_f16 0x136f08c40
ggml_metal_init: recommendedMaxWorkingSetSize = 21845.34 MB
ggml_metal_init: hasUnifiedMemory = true
ggml_metal_init: maxTransferRate = built-in GPU
llama_new_context_with_model: max tensor size = 87.89 MB
ggml_metal_add_buffer: allocated 'data ' buffer, size = 6984.06 MB, ( 6984.52 / 21845.34)
ggml_metal_add_buffer: allocated 'eval ' buffer, size = 1032.00 MB, ( 8016.52 / 21845.34)
ggml_metal_add_buffer: allocated 'kv ' buffer, size = 1602.00 MB, ( 9618.52 / 21845.34)
ggml_metal_add_buffer: allocated 'scr0 ' buffer, size = 426.00 MB, (10044.52 / 21845.34)
ggml_metal_add_buffer: allocated 'scr1 ' buffer, size = 512.00 MB, (10556.52 / 21845.34)
system_info: n_threads = 8 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.700000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
generate: n_ctx = 2048, n_batch = 512, n_predict = -1, n_keep = 0
[INST] Hello Llama! [/INST] Hello there! I'm a large language model, but you can call me Llama if you like. How may I assist you today? [end of text]
llama_print_timings: load time = 10938.20 ms
llama_print_timings: sample time = 21.73 ms / 31 runs ( 0.70 ms per token, 1426.73 tokens per second)
llama_print_timings: prompt eval time = 1190.98 ms / 14 tokens ( 85.07 ms per token, 11.76 tokens per second)
llama_print_timings: eval time = 1676.02 ms / 30 runs ( 55.87 ms per token, 17.90 tokens per second)
llama_print_timings: total time = 2891.00 ms
ggml_metal_free: deallocating
gengwg@gengwg-mbp:~/llama.cpp$ echo "Prompt: " && read PROMPT && ./main -t 8 -ngl 1 -m ${MODEL} --color -c 2048 --temp 0.7 --repeat_penalty 1.1 -n -1 -p "[INST] ${PROMPT} [/INST] "
Prompt:
Tell me a joke about Llama.
main: build = 856 (e782c9e)
main: seed = 1689897312
llama.cpp: loading model from llama-2-13b-chat.ggmlv3.q4_0.bin
llama_model_load_internal: format = ggjt v3 (latest)
llama_model_load_internal: n_vocab = 32000
llama_model_load_internal: n_ctx = 2048
llama_model_load_internal: n_embd = 5120
llama_model_load_internal: n_mult = 256
llama_model_load_internal: n_head = 40
llama_model_load_internal: n_layer = 40
llama_model_load_internal: n_rot = 128
llama_model_load_internal: freq_base = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype = 2 (mostly Q4_0)
llama_model_load_internal: n_ff = 13824
llama_model_load_internal: model size = 13B
llama_model_load_internal: ggml ctx size = 0.09 MB
llama_model_load_internal: mem required = 8953.71 MB (+ 1608.00 MB per state)
llama_new_context_with_model: kv self size = 1600.00 MB
ggml_metal_init: allocating
ggml_metal_init: using MPS
ggml_metal_init: loading '/Users/gengwg/llama.cpp/ggml-metal.metal'
ggml_metal_init: loaded kernel_add 0x13b706ba0
ggml_metal_init: loaded kernel_mul 0x13b7075e0
ggml_metal_init: loaded kernel_mul_row 0x13b707c10
ggml_metal_init: loaded kernel_scale 0x13b708130
ggml_metal_init: loaded kernel_silu 0x13b708650
ggml_metal_init: loaded kernel_relu 0x13b708b70
ggml_metal_init: loaded kernel_gelu 0x13b709090
ggml_metal_init: loaded kernel_soft_max 0x13b709740
ggml_metal_init: loaded kernel_diag_mask_inf 0x13b709da0
ggml_metal_init: loaded kernel_get_rows_f16 0x13b70a420
ggml_metal_init: loaded kernel_get_rows_q4_0 0x13b70aaa0
ggml_metal_init: loaded kernel_get_rows_q4_1 0x13b70b290
ggml_metal_init: loaded kernel_get_rows_q2_K 0x13b70b910
ggml_metal_init: loaded kernel_get_rows_q3_K 0x13b70bf90
ggml_metal_init: loaded kernel_get_rows_q4_K 0x13b70c610
ggml_metal_init: loaded kernel_get_rows_q5_K 0x13b70cc90
ggml_metal_init: loaded kernel_get_rows_q6_K 0x13b70d310
ggml_metal_init: loaded kernel_rms_norm 0x13b70d9d0
ggml_metal_init: loaded kernel_norm 0x13b70e080
ggml_metal_init: loaded kernel_mul_mat_f16_f32 0x13b70ea50
ggml_metal_init: loaded kernel_mul_mat_q4_0_f32 0x13b70f110
ggml_metal_init: loaded kernel_mul_mat_q4_1_f32 0x13b70f7d0
ggml_metal_init: loaded kernel_mul_mat_q2_K_f32 0x13b70feb0
ggml_metal_init: loaded kernel_mul_mat_q3_K_f32 0x13b710730
ggml_metal_init: loaded kernel_mul_mat_q4_K_f32 0x13b710df0
ggml_metal_init: loaded kernel_mul_mat_q5_K_f32 0x13b711490
ggml_metal_init: loaded kernel_mul_mat_q6_K_f32 0x13b711b30
ggml_metal_init: loaded kernel_rope 0x13b712250
ggml_metal_init: loaded kernel_alibi_f32 0x13b712d70
ggml_metal_init: loaded kernel_cpy_f32_f16 0x13b713600
ggml_metal_init: loaded kernel_cpy_f32_f32 0x13b713e90
ggml_metal_init: loaded kernel_cpy_f16_f16 0x13b714720
ggml_metal_init: recommendedMaxWorkingSetSize = 21845.34 MB
ggml_metal_init: hasUnifiedMemory = true
ggml_metal_init: maxTransferRate = built-in GPU
llama_new_context_with_model: max tensor size = 87.89 MB
ggml_metal_add_buffer: allocated 'data ' buffer, size = 6984.06 MB, ( 6984.52 / 21845.34)
ggml_metal_add_buffer: allocated 'eval ' buffer, size = 1032.00 MB, ( 8016.52 / 21845.34)
ggml_metal_add_buffer: allocated 'kv ' buffer, size = 1602.00 MB, ( 9618.52 / 21845.34)
ggml_metal_add_buffer: allocated 'scr0 ' buffer, size = 426.00 MB, (10044.52 / 21845.34)
ggml_metal_add_buffer: allocated 'scr1 ' buffer, size = 512.00 MB, (10556.52 / 21845.34)
system_info: n_threads = 8 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 |
sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.700000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000
generate: n_ctx = 2048, n_batch = 512, n_predict = -1, n_keep = 0
[INST] Tell me a joke about Llama. [/INST] Sure, here's one for you:
Why did the llama refuse to play poker?
Because he always got fleeced!
(Get it? Fleeced like a llama's woolly coat... oh well, maybe it's just a lame joke!) [end of text]
llama_print_timings: load time = 4603.90 ms
llama_print_timings: sample time = 47.68 ms / 68 runs ( 0.70 ms per token, 1426.17 tokens per second)
llama_print_timings: prompt eval time = 1425.97 ms / 19 tokens ( 75.05 ms per token, 13.32 tokens per second)
llama_print_timings: eval time = 4094.24 ms / 67 runs ( 61.11 ms per token, 16.36 tokens per second)
llama_print_timings: total time = 5573.80 ms
ggml_metal_free: deallocating