Created
June 6, 2023 16:06
-
-
Save soleblaze/5300bea5ebb8236d54368b1940a1d653 to your computer and use it in GitHub Desktop.
llama cpp output using metal on commit 5220a99
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
llama.cpp (5220a99) | |
❯ pwd | |
/Users/soleblaze/git/thirdparty/localai/go-llama/llama.cpp | |
❯ LLAMA_METAL=1 make | |
I llama.cpp build info: | |
I UNAME_S: Darwin | |
I UNAME_P: arm | |
I UNAME_M: arm64 | |
I CFLAGS: -I. -O3 -std=c11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -pthread -DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_METAL_NDEBUG | |
I CXXFLAGS: -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_METAL | |
I LDFLAGS: -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders | |
I CC: Apple clang version 14.0.3 (clang-1403.0.22.14.1) | |
I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1) | |
c++ -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_METAL examples/main/main.cpp ggml.o llama.o common.o ggml-metal.o -o main -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders | |
==== Run ./main -h for help. ==== | |
c++ -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_METAL examples/quantize/quantize.cpp ggml.o llama.o ggml-metal.o -o quantize -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders | |
c++ -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_METAL examples/quantize-stats/quantize-stats.cpp ggml.o llama.o ggml-metal.o -o quantize-stats -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders | |
c++ -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_METAL examples/perplexity/perplexity.cpp ggml.o llama.o common.o ggml-metal.o -o perplexity -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders | |
c++ -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_METAL examples/embedding/embedding.cpp ggml.o llama.o common.o ggml-metal.o -o embedding -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders | |
c++ -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_METAL pocs/vdot/vdot.cpp ggml.o ggml-metal.o -o vdot -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders | |
❯ ./main -m ~/models/ggml-model-q4_0.bin -p "hi." -ngl 1 | |
main: build = 615 (5220a99) | |
main: seed = 1686067115 | |
llama.cpp: loading model from /Users/soleblaze/models/ggml-model-q4_0.bin | |
llama.cpp: loading model from /Users/soleblaze/models/ggml-model-q4_0.bin | |
llama_model_load_internal: format = ggjt v3 (latest) | |
llama_model_load_internal: n_vocab = 32000 | |
llama_model_load_internal: n_ctx = 512 | |
llama_model_load_internal: n_embd = 4096 | |
llama_model_load_internal: n_mult = 256 | |
llama_model_load_internal: n_head = 32 | |
llama_model_load_internal: n_layer = 32 | |
llama_model_load_internal: n_rot = 128 | |
llama_model_load_internal: ftype = 2 (mostly Q4_0) | |
llama_model_load_internal: n_ff = 11008 | |
llama_model_load_internal: n_parts = 1 | |
llama_model_load_internal: model size = 7B | |
llama_model_load_internal: ggml ctx size = 0.07 MB | |
llama_model_load_internal: mem required = 1932.71 MB (+ 1026.00 MB per state) | |
. | |
llama_init_from_file: kv self size = 256.00 MB | |
ggml_metal_init: allocating | |
ggml_metal_init: using MPS | |
ggml_metal_init: loading '/Users/soleblaze/git/thirdparty/localai/go-llama/llama.cpp/ggml-metal.metal' | |
ggml_metal_init: loaded kernel_add 0x134709020 | |
ggml_metal_init: loaded kernel_mul 0x134709740 | |
ggml_metal_init: loaded kernel_mul_row 0x134709d70 | |
ggml_metal_init: loaded kernel_scale 0x13470a290 | |
ggml_metal_init: loaded kernel_silu 0x13470a7b0 | |
ggml_metal_init: loaded kernel_relu 0x13470acd0 | |
ggml_metal_init: loaded kernel_soft_max 0x13470b380 | |
ggml_metal_init: loaded kernel_diag_mask_inf 0x13470b9e0 | |
ggml_metal_init: loaded kernel_get_rows_q4_0 0x13470c060 | |
ggml_metal_init: loaded kernel_rms_norm 0x13470c710 | |
ggml_metal_init: loaded kernel_mul_mat_q4_0_f32 0x13470cf70 | |
ggml_metal_init: loaded kernel_mul_mat_f16_f32 0x13470d940 | |
ggml_metal_init: loaded kernel_rope 0x13470e230 | |
ggml_metal_init: loaded kernel_cpy_f32_f16 0x13470eac0 | |
ggml_metal_init: loaded kernel_cpy_f32_f32 0x13470f350 | |
ggml_metal_add_buffer: allocated 'data ' buffer, size = 3616.07 MB | |
ggml_metal_add_buffer: allocated 'eval ' buffer, size = 768.00 MB | |
ggml_metal_add_buffer: allocated 'kv ' buffer, size = 258.00 MB | |
ggml_metal_add_buffer: allocated 'scr0 ' buffer, size = 512.00 MB | |
ggml_metal_add_buffer: allocated 'scr1 ' buffer, size = 512.00 MB | |
system_info: n_threads = 16 / 20 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | | |
sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000 | |
generate: n_ctx = 512, n_batch = 512, n_predict = -1, n_keep = 0 | |
hi. I am interested in your workshop on June 23rd, but I see that you don't have any spaces left. is there anyway I can get on the waiting list? | |
If there are any spaces come up before May 31st, we will take registrations from the waiting list. [end of text] | |
llama_print_timings: load time = 7054.57 ms | |
llama_print_timings: sample time = 45.89 ms / 64 runs ( 0.72 ms per token) | |
llama_print_timings: prompt eval time = 205.91 ms / 3 tokens ( 68.64 ms per token) | |
llama_print_timings: eval time = 1561.02 ms / 63 runs ( 24.78 ms per token) | |
llama_print_timings: total time = 8667.92 ms |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment