Created July 18, 2024 13:02
A set of scripts for training a small tokenizer in a new language, merging the small tokenizer with an existing one, and saving the combined tokenizer along with the resized model.
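The tokenizer-training step itself is not included in the listings below. As a minimal sketch (not part of the original gist shown here), training a small ByteLevelBPE tokenizer on a new-language corpus with the `tokenizers` library could look like the following; the corpus path, output directory, and the 14k vocabulary size are illustrative assumptions, the size chosen only to match the "added-hebrew-14k" naming used further down.

# train_small_tokenizer.py -- minimal sketch with assumed paths and sizes
import os
from tokenizers import ByteLevelBPETokenizer

tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=["hebrew_corpus.txt"],   # assumed plain-text corpus in the new language
    vocab_size=14000,              # assumed small vocabulary ("14k")
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)
os.makedirs("./hebrew-bpe-14k", exist_ok=True)
tokenizer.save_model("./hebrew-bpe-14k")  # writes vocab.json and merges.txt for the combine script below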
""" | |
Given two tokenizers, combine them and create a new tokenizer | |
Usage: python combine_tokenizers.py --tokenizer1 ../config/en/roberta_8 --tokenizer2 ../config/hi/roberta_8 --save_dir ../config/en/en_hi/roberta_8 | |
Source: https://github.com/huggingface/tokenizers/issues/690#issuecomment-830665989 | |
""" | |
# Libraries for tokenizer | |
from pathlib import Path | |
from tokenizers import ByteLevelBPETokenizer | |
import argparse | |
import json | |
import os | |
from tqdm import tqdm | |
from transformers import AutoTokenizer | |
from timeit import default_timer as timer | |
import sys | |
def combine_tokenizers(args): | |
# Load both the json files, take the union, and store it | |
json1 = json.load(open(os.path.join(args.tokenizer1, 'vocab.json'))) | |
json2 = json.load(open(os.path.join(args.tokenizer2, 'vocab.json'))) | |
# Create a new vocabulary | |
new_vocab = {} | |
idx = 0 | |
for word in json1.keys(): | |
if word not in new_vocab.keys(): | |
new_vocab[word] = idx | |
idx += 1 | |
# Add words from second tokenizer | |
for word in json2.keys(): | |
if word not in new_vocab.keys(): | |
new_vocab[word] = idx | |
idx += 1 | |
# Make the directory if necessary | |
if not os.path.exists(args.save_dir): | |
os.makedirs(args.save_dir) | |
# Save the vocab | |
with open(os.path.join(args.save_dir, 'vocab.json'), 'w') as fp: | |
json.dump(new_vocab, fp, ensure_ascii=False) | |
# Merge the two merges file. Don't handle duplicates here | |
# Concatenate them, but ignore the first line of the second file | |
os.system('cat {} > {}'.format(os.path.join(args.tokenizer1, 'merges.txt'), os.path.join(args.save_dir, 'merges.txt'))) | |
os.system('tail -n +2 -q {} >> {}'.format(os.path.join(args.tokenizer2, 'merges.txt'), os.path.join(args.save_dir, 'merges.txt'))) | |
# Save other files | |
os.system('cp {} {}'.format(os.path.join(args.tokenizer1, 'special_tokens_map.json'), args.save_dir)) | |
os.system('cp {} {}'.format(os.path.join(args.tokenizer1, 'tokenizer_config.json'), args.save_dir)) | |
# Instantiate the new tokenizer | |
tokenizer = AutoTokenizer.from_pretrained(args.save_dir, use_fast=True) | |
tokenizer.save_pretrained(args.save_dir+'/tokenizer') | |
def main(): | |
parser = argparse.ArgumentParser() | |
# Dataset Arguments | |
parser.add_argument("--tokenizer1", type=str, required=True, help="") | |
parser.add_argument("--tokenizer2", type=str, required=True, help="") | |
parser.add_argument("--save_dir", type=str, required=True, help="") | |
args = parser.parse_args() | |
combine_tokenizers(args) | |
if __name__ == '__main__': | |
main() | |
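Once the combine script has run, a quick sanity check is to reload the combined tokenizer from --save_dir and encode a mixed-language string. The path below follows the usage line in the docstring above; the sample sentence is arbitrary.

from transformers import AutoTokenizer

# Path follows the usage example above; the sample text is arbitrary
combined = AutoTokenizer.from_pretrained("../config/en/en_hi/roberta_8", use_fast=True)
ids = combined.encode("Hello दुनिया")
print(combined.convert_ids_to_tokens(ids))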
import os
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cpu"  # use "cuda" for GPU usage or "cpu" for CPU usage

# Load the combined tokenizer (base SmolLM tokenizer with the added Hebrew tokens)
tokenizer = AutoTokenizer.from_pretrained("./SmolLM-tokenizer-with-added-hebrew-14k")

# For multiple GPUs install accelerate and do `model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")`
model = AutoModelForCausalLM.from_pretrained("./SmolLM-135M").to(device)

# Resize the embedding matrix so there is a row for every token in the combined vocabulary
model.resize_token_embeddings(len(tokenizer))

# Quick generation check with the resized model
inputs = tokenizer.encode("def print_hello_world():", return_tensors="pt").to(device)
outputs = model.generate(inputs)
print(tokenizer.decode(outputs[0]))

# Save the resized model together with the combined tokenizer
model.save_pretrained("./Heb-SmolLM-135M")
tokenizer.save_pretrained("./Heb-SmolLM-135M")
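As a minimal follow-up check, assuming the save paths from the script above, the saved model and tokenizer can be reloaded to confirm the embedding matrix now has one row per token of the combined vocabulary.

from transformers import AutoModelForCausalLM, AutoTokenizer

# Reload what the script above saved and verify vocab/embedding sizes agree
tok = AutoTokenizer.from_pretrained("./Heb-SmolLM-135M")
mdl = AutoModelForCausalLM.from_pretrained("./Heb-SmolLM-135M")
assert mdl.get_input_embeddings().weight.shape[0] == len(tok)
print("vocab size:", len(tok))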