# See ggml-org/llama.cpp#1602 (comment); see also ggml-org/ggml#231
# Create a fresh python environment just for the conversion script.
# (skip these two conda commands if you're happy installing these common packages in your current python env)
# You can install an open-source conda with instructions from here: https://github.com/conda-forge/miniforge
conda create -n jploski-ggml-falcon python=3.9 -y
conda activate jploski-ggml-falcon
pip install -qq torch numpy transformers
# Fetch the Falcon checkpoints from Hugging Face (instruct variants; base models left commented out).
git clone https://huggingface.co/tiiuae/falcon-40b-instruct
# git clone https://huggingface.co/tiiuae/falcon-40b
git clone https://huggingface.co/tiiuae/falcon-7b-instruct
# git clone https://huggingface.co/tiiuae/falcon-7b
# Build jploski's ggml fork; the falcon40b branch carries the Falcon 40B support.
git clone https://github.com/jploski/ggml jploski-ggml && cd jploski-ggml && git switch falcon40b
# git clone https://github.com/apage43/ggml apage43-ggml && cd apage43-ggml && git switch falcon
# The subshell keeps the current directory at the repo root after the build finishes.
( mkdir -p build && cd build && cmake .. && make -j4 falcon falcon-quantize )
# Times indicated for a MBP M2 Max 96GB
# Conversion takes 37sec
# The 40b model is in 9 parts
# First positional arg is the number of PyTorch checkpoint shards (9 for 40B, 2 for 7B).
# "{,}" brace-expands the path twice, passing the checkpoint dir as both input and output dir.
time python3 examples/falcon/convert-hf-to-ggml.py 9 ../falcon-40b-instruct{,}
time python3 examples/falcon/convert-hf-to-ggml.py 2 ../falcon-7b-instruct{,}
# Each quantization takes 2min8sec for 40B
# Quantize both models to every supported format. The trailing integer is
# the ggml quantization type id (2=q4_0, 3=q4_1, 8=q5_0, 9=q5_1, 7=q8_0).
for m in falcon-7b-instruct falcon-40b-instruct ; do
  # The quoted prefix protects "$m" from word-splitting/globbing (SC2086);
  # the unquoted {f16,<fmt>} still brace-expands into the f16 input path
  # followed by the quantized output path.
  build/bin/falcon-quantize "../$m/ggml-model-$m-"{f16,q4_0}.bin 2
  build/bin/falcon-quantize "../$m/ggml-model-$m-"{f16,q4_1}.bin 3
  build/bin/falcon-quantize "../$m/ggml-model-$m-"{f16,q5_0}.bin 8
  build/bin/falcon-quantize "../$m/ggml-model-$m-"{f16,q5_1}.bin 9
  build/bin/falcon-quantize "../$m/ggml-model-$m-"{f16,q8_0}.bin 7
done
# Smoke-test every quantization (plus f16) of both models with a short prompt.
for m in falcon-7b-instruct falcon-40b-instruct ; do
  for q in q4_0 q4_1 q5_0 q5_1 q8_0 f16 ; do
    # Quote the model path so "$m"/"$q" can never word-split or glob (SC2086).
    build/bin/falcon -m "../$m/ggml-model-$m-$q.bin" -p "Say something funny." --n_predict 10
  done
done
# Inference works!
build/bin/falcon -m ../falcon-40b-instruct/ggml-model-falcon-40b-instruct-q4_0.bin -p "Roses are red, sky is blue. What does this tell us about the meaning of life?"
build/bin/falcon -m ../falcon-7b-instruct/ggml-model-falcon-7b-instruct-q8_0.bin -p "Roses are red, sky is blue. What does this tell us about the meaning of life?"
# Compare every quantization level of the 40B model on the same prompt.
for q in q4_0 q4_1 q5_0 q5_1 q8_0 ; do
  # NOTE(review): the original command ended in a dangling "--" (looks like a
  # truncated paste); restored the "--n_predict 10" used by the earlier
  # smoke-test loop — confirm this was the intended flag.
  build/bin/falcon -m "../falcon-40b-instruct/ggml-model-falcon-40b-instruct-$q.bin" -p "Say something funny." --n_predict 10
done
# Quantize the OpenLLaMA checkpoints with the llama.cpp quantize tool.
for m in open_llama_13b_600bt open_llama_3b open_llama_7b ; do
  for q in q8_0 ; do
    # Quote all expansions against word-splitting/globbing (SC2086).
    ./quantize "./models/$m/ggml-model-f16.bin" "./models/$m/ggml-model-${q}.bin" "$q"
  done
done
# --- lit-llama: run and finetune OpenLLaMA 7B with Lightning's lit-llama ---
conda create -n lit-llama python=3.10 -y
conda activate lit-llama
git clone https://github.com/Lightning-AI/lit-llama
cd lit-llama
pip install -r requirements.txt
# The HF checkpoint uses git-lfs for the weight files.
git lfs install
git clone https://huggingface.co/openlm-research/open_llama_7b checkpoints/open-llama/7B
python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/open-llama/7B --model_size 7B
# Fall back to CPU for ops the MPS backend does not implement.
export PYTORCH_ENABLE_MPS_FALLBACK=1
python generate.py --prompt "Hello, my name is" --max_new_tokens=128
# Time for inference 1: 28.14 sec total, 4.55 tokens/sec
python generate.py --prompt "Hello, my name is" --max_new_tokens=128 --accelerator=cpu
# Time for inference 1: 68.79 sec total, 1.86 tokens/sec
python scripts/prepare_alpaca.py
python finetune/lora.py
# BUG FIX: the comment below was fused onto the previous command
# ("adapter.py# conda env delete ..."); without whitespace before '#',
# bash does NOT start a comment, so python was invoked with the mangled
# argument "finetune/adapter.py#" plus "conda env delete -n lit-parrot".
python finetune/adapter.py
# conda env delete -n lit-parrot
# --- lit-parrot: run Falcon 7B with Lightning's lit-parrot ---
# -y added for consistency with the other non-interactive conda create calls.
conda create -n lit-parrot python=3.10 -y
conda activate lit-parrot
git clone https://github.com/Lightning-AI/lit-parrot
cd lit-parrot
# lit-parrot requires a PyTorch 2.1 nightly build.
pip install --index-url https://download.pytorch.org/whl/nightly/cpu --pre 'torch>=2.1.0dev'
pip install -r requirements.txt
pip install huggingface_hub
# python scripts/download.py --repo_id tiiuae/falcon-7b
python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/tiiuae/falcon-7b
# Fall back to CPU for ops the MPS backend does not implement.
export PYTORCH_ENABLE_MPS_FALLBACK=1
python generate/base.py --prompt "Hello, my name is" --checkpoint_dir checkpoints/tiiuae/falcon-7b --precision 16-mixed