Created
February 11, 2024 11:02
-
-
Save pszemraj/3246ff18c7f5c0a58b401c8762aa3580 to your computer and use it in GitHub Desktop.
update tokenizer.json to use postprocessing similar to BERT's
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pathlib import Path
import json
def update_tokenizer_post_processor(input_path):
    """
    Replace the post_processor of a tokenizer.json file in place.

    Loads the tokenizer configuration found at *input_path* (either the
    file itself or a directory containing ``tokenizer.json``), swaps in a
    BERT-style TemplateProcessing post_processor, and overwrites the
    original file.

    Args:
    - input_path (str): Path to the tokenizer.json file.
    """
    target = Path(input_path)
    # A directory argument is taken to hold the conventional tokenizer.json.
    if target.is_dir():
        target = target / "tokenizer.json"

    # Best-effort: report and bail rather than raise when nothing exists.
    if not target.exists():
        print(f"The file at {input_path} does not exist.")
        return

    config = json.loads(target.read_text(encoding="utf-8"))

    # Building blocks of the template: <s> A </s> (single) and
    # <s> A </s> B </s> (pair), with segment B carrying type_id 1.
    bos = {"SpecialToken": {"id": "<s>", "type_id": 0}}
    eos = {"SpecialToken": {"id": "</s>", "type_id": 0}}
    seq_a = {"Sequence": {"id": "A", "type_id": 0}}
    seq_b = {"Sequence": {"id": "B", "type_id": 1}}

    config["post_processor"] = {
        "type": "TemplateProcessing",
        "single": [bos, seq_a, eos],
        "pair": [
            bos,
            seq_a,
            eos,
            seq_b,
            {"SpecialToken": {"id": "</s>", "type_id": 1}},
        ],
        # NOTE(review): token ids 0 (<s>) and 2 (</s>) are hard-coded —
        # confirm they match the vocabulary of the tokenizer being edited.
        "special_tokens": {
            "<s>": {"id": "<s>", "ids": [0], "tokens": ["<s>"]},
            "</s>": {"id": "</s>", "ids": [2], "tokens": ["</s>"]},
        },
    }

    # Overwrite the original file with the updated configuration.
    with target.open("w", encoding="utf-8") as fh:
        json.dump(config, fh, ensure_ascii=False, indent=4)
    print(f"Tokenizer configuration at {input_path} has been updated.")
# Example usage:
# update_tokenizer_post_processor("path/to/your/tokenizer.json")
# Note: Uncomment and replace the path with your actual tokenizer.json file path to use the function.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment