This document is a collection of thoughts and observations about the tokenizers used in Llama-rooted large language models.
Most Llama-derived models ship with the LlamaTokenizer from Hugging Face transformers, a SentencePiece-based BPE tokenizer.
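As a minimal sketch of what that looks like in practice (the checkpoint name below is a placeholder, not something prescribed by this document; any Llama-family checkpoint that ships a SentencePiece model should behave the same way):

```python
from transformers import LlamaTokenizer

# Placeholder checkpoint; substitute any Llama-family model repo.
tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b")

ids = tokenizer.encode("Tokenizers are fun!")           # token ids, BOS prepended by default
pieces = tokenizer.convert_ids_to_tokens(ids)           # SentencePiece pieces, e.g. '▁Token'
text = tokenizer.decode(ids, skip_special_tokens=True)  # round-trips back to the input
print(ids, pieces, text, sep="\n")
```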
```python
import contextlib
import json
import os
import random

import datasets
from datasets.combine import concatenate_datasets
import pandas as pd
import torch
from peft import LoftQConfig, PeftModel, PeftConfig
from transformers import TrainingArguments
```
```python
# engine/contextflow.py
from ctypes import (
    c_float, c_size_t, c_void_p, c_char, c_int,
    c_uint8, c_int8, c_int32, pointer, byref,
)
import logging
import multiprocessing
import os
from typing import Any, Dict, List, Optional

import numpy as np

import llama_cpp
from llama_cpp._internals import _LlamaTokenDataArray
```
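Given the llama_cpp imports above, here is a hedged sketch of the equivalent tokenization path through llama-cpp-python's high-level API (the GGUF path is a placeholder; `vocab_only=True` loads only the vocabulary, which is enough for tokenizing):

```python
import llama_cpp

# Placeholder path; point this at any GGUF file for a Llama-family model.
llm = llama_cpp.Llama(model_path="./model.gguf", vocab_only=True)

toks = llm.tokenize(b"Tokenizers are fun!")  # bytes in, token ids out (BOS prepended by default)
raw = llm.detokenize(toks)                   # token ids back to raw bytes
print(toks, raw.decode("utf-8", errors="replace"), sep="\n")
```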