Created
May 5, 2024 01:32
-
-
Save ink-splatters/d0899a4749261b7dfd4818f40ff062b4 to your computer and use it in GitHub Desktop.
llama.cpp choosing BPE pre-tokenizer logic
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
airstation:llama.cpp ic$ git rev-parse HEAD
952d03dbead16e4dbdd1d3458486340673cc2465
airstation:llama.cpp ic$ echo ; awk '(NR>=4341 && NR<=4382 ){print NR " " $0}' llama.cpp
4341         // for now, only BPE models have pre-tokenizers
4342         if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
4343             if (tokenizer_pre.empty()) {
4344                 LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
4345                 LLAMA_LOG_WARN("%s:                                             \n", __func__);
4346                 LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
4347                 LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED!        \n", __func__);
4348                 LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL             \n", __func__);
4349                 LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
4350                 LLAMA_LOG_WARN("%s:                                             \n", __func__);
4351                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4352             } else if (
4353                     tokenizer_pre == "default") {
4354                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4355             } else if (
4356                     tokenizer_pre == "llama3"   ||
4357                     tokenizer_pre == "llama-v3" ||
4358                     tokenizer_pre == "llama-bpe") {
4359                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
4360             } else if (
4361                     tokenizer_pre == "deepseek-llm") {
4362                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
4363             } else if (
4364                     tokenizer_pre == "deepseek-coder") {
4365                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
4366             } else if (
4367                     tokenizer_pre == "falcon") {
4368                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
4369             } else if (
4370                     tokenizer_pre == "mpt") {
4371                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
4372             } else if (
4373                     tokenizer_pre == "starcoder") {
4374                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
4375             } else if (
4376                     tokenizer_pre == "gpt-2") {
4377                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
4378             } else {
4379                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
4380             }
4381         } else {
4382             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment