Created
May 5, 2024 01:32
-
-
Save ink-splatters/d0899a4749261b7dfd4818f40ff062b4 to your computer and use it in GitHub Desktop.
llama.cpp choosing BPE pre-tokenizer logic
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
airstation:llama.cpp ic$ git rev-parse HEAD
952d03dbead16e4dbdd1d3458486340673cc2465
airstation:llama.cpp ic$ echo ; awk '(NR>=4341 && NR<=4382 ){print NR " " $0}' llama.cpp
4341         // for now, only BPE models have pre-tokenizers
4342         if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
4343             if (tokenizer_pre.empty()) {
4344                 LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
4345                 LLAMA_LOG_WARN("%s:                                             \n", __func__);
4346                 LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
4347                 LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED!        \n", __func__);
4348                 LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL             \n", __func__);
4349                 LLAMA_LOG_WARN("%s: ************************************        \n", __func__);
4350                 LLAMA_LOG_WARN("%s:                                             \n", __func__);
4351                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4352             } else if (
4353                     tokenizer_pre == "default") {
4354                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
4355             } else if (
4356                     tokenizer_pre == "llama3"   ||
4357                     tokenizer_pre == "llama-v3" ||
4358                     tokenizer_pre == "llama-bpe") {
4359                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
4360             } else if (
4361                     tokenizer_pre == "deepseek-llm") {
4362                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
4363             } else if (
4364                     tokenizer_pre == "deepseek-coder") {
4365                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
4366             } else if (
4367                     tokenizer_pre == "falcon") {
4368                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
4369             } else if (
4370                     tokenizer_pre == "mpt") {
4371                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
4372             } else if (
4373                     tokenizer_pre == "starcoder") {
4374                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
4375             } else if (
4376                     tokenizer_pre == "gpt-2") {
4377                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
4378             } else {
4379                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
4380             }
4381         } else {
4382             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment