Created
June 6, 2023 16:06
-
-
Save soleblaze/5300bea5ebb8236d54368b1940a1d653 to your computer and use it in GitHub Desktop.
llama cpp output using metal on commit 5220a99
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
llama.cpp (5220a99) | |
❯ pwd | |
/Users/soleblaze/git/thirdparty/localai/go-llama/llama.cpp | |
❯ LLAMA_METAL=1 make | |
I llama.cpp build info: | |
I UNAME_S: Darwin | |
I UNAME_P: arm | |
I UNAME_M: arm64 | |
I CFLAGS: -I. -O3 -std=c11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -pthread -DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_METAL_NDEBUG | |
I CXXFLAGS: -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_METAL | |
I LDFLAGS: -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders | |
I CC: Apple clang version 14.0.3 (clang-1403.0.22.14.1) | |
I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1) | |
c++ -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_METAL examples/main/main.cpp ggml.o llama.o common.o ggml-metal.o -o main -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders | |
==== Run ./main -h for help. ==== | |
c++ -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_METAL examples/quantize/quantize.cpp ggml.o llama.o ggml-metal.o -o quantize -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders | |
c++ -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_METAL examples/quantize-stats/quantize-stats.cpp ggml.o llama.o ggml-metal.o -o quantize-stats -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders | |
c++ -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_METAL examples/perplexity/perplexity.cpp ggml.o llama.o common.o ggml-metal.o -o perplexity -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders | |
c++ -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_METAL examples/embedding/embedding.cpp ggml.o llama.o common.o ggml-metal.o -o embedding -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders | |
c++ -I. -I./examples -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_METAL pocs/vdot/vdot.cpp ggml.o ggml-metal.o -o vdot -framework Accelerate -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders | |
❯ ./main -m ~/models/ggml-model-q4_0.bin -p "hi." -ngl 1 | |
main: build = 615 (5220a99) | |
main: seed = 1686067115 | |
llama.cpp: loading model from /Users/soleblaze/models/ggml-model-q4_0.bin | |
llama.cpp: loading model from /Users/soleblaze/models/ggml-model-q4_0.bin | |
llama_model_load_internal: format = ggjt v3 (latest) | |
llama_model_load_internal: n_vocab = 32000 | |
llama_model_load_internal: n_ctx = 512 | |
llama_model_load_internal: n_embd = 4096 | |
llama_model_load_internal: n_mult = 256 | |
llama_model_load_internal: n_head = 32 | |
llama_model_load_internal: n_layer = 32 | |
llama_model_load_internal: n_rot = 128 | |
llama_model_load_internal: ftype = 2 (mostly Q4_0) | |
llama_model_load_internal: n_ff = 11008 | |
llama_model_load_internal: n_parts = 1 | |
llama_model_load_internal: model size = 7B | |
llama_model_load_internal: ggml ctx size = 0.07 MB | |
llama_model_load_internal: mem required = 1932.71 MB (+ 1026.00 MB per state) | |
. | |
llama_init_from_file: kv self size = 256.00 MB | |
ggml_metal_init: allocating | |
ggml_metal_init: using MPS | |
ggml_metal_init: loading '/Users/soleblaze/git/thirdparty/localai/go-llama/llama.cpp/ggml-metal.metal' | |
ggml_metal_init: loaded kernel_add 0x134709020 | |
ggml_metal_init: loaded kernel_mul 0x134709740 | |
ggml_metal_init: loaded kernel_mul_row 0x134709d70 | |
ggml_metal_init: loaded kernel_scale 0x13470a290 | |
ggml_metal_init: loaded kernel_silu 0x13470a7b0 | |
ggml_metal_init: loaded kernel_relu 0x13470acd0 | |
ggml_metal_init: loaded kernel_soft_max 0x13470b380 | |
ggml_metal_init: loaded kernel_diag_mask_inf 0x13470b9e0 | |
ggml_metal_init: loaded kernel_get_rows_q4_0 0x13470c060 | |
ggml_metal_init: loaded kernel_rms_norm 0x13470c710 | |
ggml_metal_init: loaded kernel_mul_mat_q4_0_f32 0x13470cf70 | |
ggml_metal_init: loaded kernel_mul_mat_f16_f32 0x13470d940 | |
ggml_metal_init: loaded kernel_rope 0x13470e230 | |
ggml_metal_init: loaded kernel_cpy_f32_f16 0x13470eac0 | |
ggml_metal_init: loaded kernel_cpy_f32_f32 0x13470f350 | |
ggml_metal_add_buffer: allocated 'data ' buffer, size = 3616.07 MB | |
ggml_metal_add_buffer: allocated 'eval ' buffer, size = 768.00 MB | |
ggml_metal_add_buffer: allocated 'kv ' buffer, size = 258.00 MB | |
ggml_metal_add_buffer: allocated 'scr0 ' buffer, size = 512.00 MB | |
ggml_metal_add_buffer: allocated 'scr1 ' buffer, size = 512.00 MB | |
system_info: n_threads = 16 / 20 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | | |
sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000 | |
generate: n_ctx = 512, n_batch = 512, n_predict = -1, n_keep = 0 | |
hi. I am interested in your workshop on June 23rd, but I see that you don't have any spaces left. is there anyway I can get on the waiting list? | |
If there are any spaces come up before May 31st, we will take registrations from the waiting list. [end of text] | |
llama_print_timings: load time = 7054.57 ms | |
llama_print_timings: sample time = 45.89 ms / 64 runs ( 0.72 ms per token) | |
llama_print_timings: prompt eval time = 205.91 ms / 3 tokens ( 68.64 ms per token) | |
llama_print_timings: eval time = 1561.02 ms / 63 runs ( 24.78 ms per token) | |
llama_print_timings: total time = 8667.92 ms |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment