Given two BPE tokenizers, combine them and create a new tokenizer
""" | |
Given two tokenizers, combine them and create a new tokenizer | |
Usage: python combine_tokenizers.py --tokenizer1 ./SmolLM-135M --tokenizer2 ./hebrew-14k --save_dir ./combined | |
Source: https://github.com/huggingface/tokenizers/issues/690#issuecomment-830665989 | |
""" | |
# Libraries for tokenizer | |
from pathlib import Path | |
from tokenizers import ByteLevelBPETokenizer | |
import argparse | |
import json | |
import os | |
from tqdm import tqdm | |
from transformers import AutoTokenizer | |
from timeit import default_timer as timer | |
import sys | |
def combine_tokenizers(args): | |
# Load both the json files, take the union, and store it | |
json1 = json.load(open(os.path.join(args.tokenizer1, 'vocab.json'))) | |
json2 = json.load(open(os.path.join(args.tokenizer2, 'vocab.json'))) | |
# Create a new vocabulary | |
new_vocab = {} | |
idx = 0 | |
for word in json1.keys(): | |
if word not in new_vocab.keys(): | |
new_vocab[word] = idx | |
idx += 1 | |
# Add words from second tokenizer | |
for word in json2.keys(): | |
if word not in new_vocab.keys(): | |
new_vocab[word] = idx | |
idx += 1 | |
# Make the directory if necessary | |
if not os.path.exists(args.save_dir): | |
os.makedirs(args.save_dir) | |
# Save the vocab | |
with open(os.path.join(args.save_dir, 'vocab.json'), 'w') as fp: | |
json.dump(new_vocab, fp, ensure_ascii=False) | |
# Merge the two merges file. Don't handle duplicates here | |
# Concatenate them, but ignore the first line of the second file | |
os.system('cat {} > {}'.format(os.path.join(args.tokenizer1, 'merges.txt'), os.path.join(args.save_dir, 'merges.txt'))) | |
os.system('tail -n +2 -q {} >> {}'.format(os.path.join(args.tokenizer2, 'merges.txt'), os.path.join(args.save_dir, 'merges.txt'))) | |
# Save other files | |
os.system('cp {} {}'.format(os.path.join(args.tokenizer1, 'special_tokens_map.json'), args.save_dir)) | |
os.system('cp {} {}'.format(os.path.join(args.tokenizer1, 'tokenizer_config.json'), args.save_dir)) | |
# Instantiate the new tokenizer | |
tokenizer = AutoTokenizer.from_pretrained(args.save_dir, use_fast=True) | |
tokenizer.save_pretrained(args.save_dir+'/tokenizer') | |
def main(): | |
parser = argparse.ArgumentParser() | |
# Dataset Arguments | |
parser.add_argument("--tokenizer1", type=str, required=True, help="") | |
parser.add_argument("--tokenizer2", type=str, required=True, help="") | |
parser.add_argument("--save_dir", type=str, required=True, help="") | |
args = parser.parse_args() | |
combine_tokenizers(args) | |
if __name__ == '__main__': | |
main() | |
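After running the script, the merged tokenizer can be loaded back with transformers and sanity-checked on text from both source domains. A minimal sketch, assuming the paths from the usage line above (--save_dir ./combined, so the saved tokenizer lives in ./combined/tokenizer) and a hypothetical pair of English/Hebrew test strings:

# Quick sanity check of the merged tokenizer (assumes --save_dir ./combined).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('./combined/tokenizer')

# The combined vocab size should be at most len(vocab1) + len(vocab2);
# it will be smaller wherever the two vocabularies overlap.
print('vocab size:', tokenizer.vocab_size)

# Round-trip a sample from each domain; decoded text should closely match the input.
for text in ['Hello world', 'שלום עולם']:  # hypothetical test strings
    ids = tokenizer.encode(text)
    print(text, '->', ids, '->', tokenizer.decode(ids))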