Skip to content

Instantly share code, notes, and snippets.

@prideout
Created October 15, 2024 12:38
Show Gist options
  • Save prideout/292c62334d59875cb3507782bc28c122 to your computer and use it in GitHub Desktop.
Save prideout/292c62334d59875cb3507782bc28c122 to your computer and use it in GitHub Desktop.
repair tokenizer
import json
# Define the input and output file paths
input_file = 'fused_model/tokenizer.json'
output_file = 'fused_model/repaired_tokenizer.json'


def join_merges(merges):
    """Return *merges* with every 2-element pair joined into one string.

    Each 2-element list/tuple ``[a, b]`` becomes ``'a b'``. Entries that
    are already strings are kept unchanged, so running the repair on an
    already-repaired file does not wipe the merge table. Entries of any
    other shape are dropped.

    Raises:
        TypeError: if a pair contains a non-string element.
    """
    joined = []
    for entry in merges:
        if isinstance(entry, str):
            # Already in joined form; keep it rather than discarding it.
            joined.append(entry)
        elif isinstance(entry, (list, tuple)) and len(entry) == 2:
            joined.append(' '.join(entry))
    return joined


def repair_tokenizer(data):
    """Rewrite ``data['model']['merges']`` in place as a list of strings.

    Returns True when a 'model' section exists and its merges were
    rewritten (a missing 'merges' key is treated as an empty list), and
    False when *data* has no 'model' key, in which case it is left
    untouched.
    """
    if 'model' not in data:
        return False
    model = data['model']
    model['merges'] = join_merges(model.get('merges', []))
    return True


def main():
    """Read ``input_file``, repair its merges and write ``output_file``."""
    # Read the JSON file containing the entire structure
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    repaired = repair_tokenizer(data)

    # Write the JSON data to a new file, preserving Unicode characters
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    if repaired:
        print(f"Successfully modified the 'merges' array and created {output_file}.")
    else:
        # Do not claim success when there was nothing to modify.
        print(f"No 'model' section found; wrote unmodified data to {output_file}.")


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment