Skip to content

Instantly share code, notes, and snippets.

@pszemraj
Created February 11, 2024 11:02
Show Gist options
  • Save pszemraj/3246ff18c7f5c0a58b401c8762aa3580 to your computer and use it in GitHub Desktop.
Save pszemraj/3246ff18c7f5c0a58b401c8762aa3580 to your computer and use it in GitHub Desktop.
Update a tokenizer.json to use a post-processing template similar to BERT's (special tokens around single sequences and sentence pairs).
from pathlib import Path
import json
def update_tokenizer_post_processor(input_path):
    """
    Replace a tokenizer's post_processor with a BERT-style template and save it.

    Loads the tokenizer configuration from *input_path*, overwrites its
    ``post_processor`` entry with a ``TemplateProcessing`` configuration
    (``<s> A </s>`` for single sequences, ``<s> A </s> B </s>`` for pairs),
    and writes the updated JSON back to the same file.

    Args:
        input_path (str): Path to a ``tokenizer.json`` file, or to a
            directory containing one.

    Returns:
        None. If the file does not exist, a message is printed and the
        function returns without raising.
    """
    tokenizer_path = Path(input_path)
    # Convenience: accept a model directory and look for tokenizer.json inside.
    if tokenizer_path.is_dir():
        tokenizer_path = tokenizer_path / "tokenizer.json"

    if not tokenizer_path.exists():
        # Report the resolved path, which may differ from input_path when a
        # directory was supplied.
        print(f"The file at {tokenizer_path} does not exist.")
        return

    # Load the existing tokenizer configuration.
    with tokenizer_path.open("r", encoding="utf-8") as file:
        tokenizer_config = json.load(file)

    # BERT-style template: the second segment of a pair gets type_id 1.
    # NOTE(review): assumes <s> has token id 0 and </s> has token id 2
    # (RoBERTa-style vocab) — confirm against the tokenizer's actual vocab.
    custom_post_processor = {
        "type": "TemplateProcessing",
        "single": [
            {"SpecialToken": {"id": "<s>", "type_id": 0}},
            {"Sequence": {"id": "A", "type_id": 0}},
            {"SpecialToken": {"id": "</s>", "type_id": 0}},
        ],
        "pair": [
            {"SpecialToken": {"id": "<s>", "type_id": 0}},
            {"Sequence": {"id": "A", "type_id": 0}},
            {"SpecialToken": {"id": "</s>", "type_id": 0}},
            {"Sequence": {"id": "B", "type_id": 1}},
            {"SpecialToken": {"id": "</s>", "type_id": 1}},
        ],
        "special_tokens": {
            "<s>": {"id": "<s>", "ids": [0], "tokens": ["<s>"]},
            "</s>": {"id": "</s>", "ids": [2], "tokens": ["</s>"]},
        },
    }

    # Update and overwrite the original file in place.
    tokenizer_config["post_processor"] = custom_post_processor
    with tokenizer_path.open("w", encoding="utf-8") as file:
        json.dump(tokenizer_config, file, ensure_ascii=False, indent=4)
    print(f"Tokenizer configuration at {tokenizer_path} has been updated.")
# Example usage: run this module directly after replacing the placeholder
# path with the actual location of your tokenizer.json file. Guarded so
# that importing the module has no side effects.
if __name__ == "__main__":
    update_tokenizer_post_processor("path/to/your/tokenizer.json")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment