Skip to content

Instantly share code, notes, and snippets.

Created July 25, 2024 15:28
Show Gist options
  • Save Norod/9f6af9042be4fbc377dc220674c93cf4 to your computer and use it in GitHub Desktop.
Save Norod/9f6af9042be4fbc377dc220674c93cf4 to your computer and use it in GitHub Desktop.
Given two BPE tokenizers, combine them and create a new tokenizer
"""Given two BPE tokenizers, combine them and create a new tokenizer.

Usage: python <script>.py --tokenizer1 ./SmolLM-135M --tokenizer2 ./hebrew-14k --save_dir ./combined
"""
# Libraries for tokenizer
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
import argparse
import json
import os
from tqdm import tqdm
from transformers import AutoTokenizer
from timeit import default_timer as timer
import sys
def _read_vocab(path):
    """Load a ``vocab.json`` token-to-id mapping from *path*, decoded as UTF-8."""
    with open(path, encoding='utf-8') as fp:
        return json.load(fp)


def combine_tokenizers(args):
    """Combine two on-disk BPE tokenizers into a new one under ``args.save_dir``.

    Args:
        args: Object with three string attributes:
            ``tokenizer1`` - directory holding ``vocab.json`` and ``merges.txt``
            (and optionally ``special_tokens_map.json`` /
            ``tokenizer_config.json``) of the first tokenizer;
            ``tokenizer2`` - directory holding ``vocab.json`` and ``merges.txt``
            of the second tokenizer;
            ``save_dir`` - output directory, created if it does not exist.

    Returns:
        The tokenizer loaded from ``args.save_dir`` via
        ``AutoTokenizer.from_pretrained(..., use_fast=True)``.

    Raises:
        FileNotFoundError: If a ``vocab.json`` or ``merges.txt`` input is missing.
    """
    import shutil  # local import: only this function needs it

    vocab1 = _read_vocab(os.path.join(args.tokenizer1, 'vocab.json'))
    vocab2 = _read_vocab(os.path.join(args.tokenizer2, 'vocab.json'))

    # Union of both vocabularies. Ids are assigned sequentially in file order:
    # first every token of tokenizer1, then the tokens of tokenizer2 that are new.
    # NOTE(review): ids stored in the input files are ignored; tokenizer1's ids
    # are only preserved if its vocab.json is already ordered by id - confirm.
    new_vocab = {word: idx for idx, word in enumerate(vocab1)}
    for word in vocab2:
        if word not in new_vocab:
            new_vocab[word] = len(new_vocab)

    # Make the directory if necessary
    os.makedirs(args.save_dir, exist_ok=True)

    # Save the vocab (UTF-8, because ensure_ascii=False emits raw non-ASCII tokens)
    with open(os.path.join(args.save_dir, 'vocab.json'), 'w', encoding='utf-8') as fp:
        json.dump(new_vocab, fp, ensure_ascii=False)

    # Merge the two merges files. Duplicates are not handled here.
    # Concatenate them, but ignore the first line of the second file.
    # Binary mode keeps the bytes exactly as they are in the inputs.
    with open(os.path.join(args.tokenizer1, 'merges.txt'), 'rb') as fp:
        first_merges = fp.read()
    with open(os.path.join(args.tokenizer2, 'merges.txt'), 'rb') as fp:
        fp.readline()  # drop the first line of the second file
        second_merges = fp.read()
    if first_merges and not first_merges.endswith(b'\n'):
        # Without this, the last rule of file 1 would fuse with the first of file 2.
        first_merges += b'\n'
    with open(os.path.join(args.save_dir, 'merges.txt'), 'wb') as fp:
        fp.write(first_merges)
        fp.write(second_merges)

    # Save other files, taken from the first tokenizer. A missing file is
    # skipped rather than treated as fatal.
    for name in ('special_tokens_map.json', 'tokenizer_config.json'):
        src = os.path.join(args.tokenizer1, name)
        if os.path.isfile(src):
            shutil.copy(src, args.save_dir)

    # Instantiate the new tokenizer from the files just written
    tokenizer = AutoTokenizer.from_pretrained(args.save_dir, use_fast=True)
    return tokenizer
def main(argv=None):
    """Parse the command-line options and combine the two tokenizers.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            argparse reads the arguments from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(
        description="Given two BPE tokenizers, combine them and create a new tokenizer."
    )

    # Tokenizer locations
    parser.add_argument(
        "--tokenizer1", type=str, required=True,
        help="Directory of the first tokenizer (vocab.json, merges.txt, config files)",
    )
    parser.add_argument(
        "--tokenizer2", type=str, required=True,
        help="Directory of the second tokenizer (vocab.json, merges.txt)",
    )
    parser.add_argument(
        "--save_dir", type=str, required=True,
        help="Directory where the combined tokenizer is written",
    )

    args = parser.parse_args(argv)

    # Without this call the parsed arguments would be discarded and nothing done.
    combine_tokenizers(args)
# Run the command-line entry point only when executed as a script.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment