Created
October 15, 2024 12:38
-
-
Save prideout/292c62334d59875cb3507782bc28c122 to your computer and use it in GitHub Desktop.
repair tokenizer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Rewrite a tokenizer JSON file so that ``model.merges`` is a list of strings.

Each merge stored as a 2-element list ``[left, right]`` is converted to the
single string ``"left right"``. Everything else in the JSON document is
copied through unchanged.
"""
import json

# Define the input and output file paths
input_file = 'fused_model/tokenizer.json'
output_file = 'fused_model/repaired_tokenizer.json'


def _join_merges(merges):
    """Return *merges* with every 2-element pair joined into one string.

    Entries that are already strings are kept as they are, so running the
    repair on an already-repaired (or partially repaired) file does not
    discard any merges.

    Raises:
        ValueError: if an entry is neither a string nor a 2-element sequence
            of strings. Dropping such an entry silently would shift the
            position of every later merge, so the problem is reported instead.
    """
    joined = []
    for index, entry in enumerate(merges):
        if isinstance(entry, str):
            joined.append(entry)
        elif (
            isinstance(entry, (list, tuple))
            and len(entry) == 2
            and all(isinstance(part, str) for part in entry)
        ):
            # NOTE: the joined form is ambiguous if either part itself
            # contains a space; the string format cannot express that case.
            joined.append(' '.join(entry))
        else:
            raise ValueError(
                f"Unexpected entry at merges[{index}]: {entry!r}"
            )
    return joined


def repair_tokenizer(input_path=input_file, output_path=output_file):
    """Read *input_path*, convert ``model.merges`` to strings, write *output_path*.

    Args:
        input_path: path of the tokenizer JSON file to read.
        output_path: path of the repaired JSON file to create or overwrite.

    Returns:
        The number of entries in the rewritten ``merges`` list, or 0 when the
        document has no ``"model"`` object (the file is then copied as is).

    Raises:
        ValueError: if ``merges`` contains a malformed entry.
    """
    # Read the JSON file containing the entire structure
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    model = data.get('model') if isinstance(data, dict) else None
    count = 0
    if isinstance(model, dict):
        # Replace the original "merges" array with the joined strings array
        joined_strings = _join_merges(model.get('merges') or [])
        model['merges'] = joined_strings
        count = len(joined_strings)

    # Write the modified JSON data back to a new file, preserving Unicode characters
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    return count


if __name__ == "__main__":
    repair_tokenizer(input_file, output_file)
    print(f"Successfully modified the 'merges' array and created {output_file}.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment