Created
February 11, 2024 11:02
-
-
Save pszemraj/3246ff18c7f5c0a58b401c8762aa3580 to your computer and use it in GitHub Desktop.
update tokenizer.json to use postprocessing similar to BERT's
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pathlib import Path
import json
def update_tokenizer_post_processor(input_path):
    """
    Replace the post_processor of a tokenizer.json file in place.

    Loads the tokenizer configuration found at *input_path* (either the
    file itself or a directory containing ``tokenizer.json``), swaps in a
    BERT-style TemplateProcessing post_processor, and overwrites the
    original file.

    Args:
    - input_path (str): Path to the tokenizer.json file.
    """
    target = Path(input_path)
    # A directory argument is taken to hold the conventional tokenizer.json.
    if target.is_dir():
        target = target / "tokenizer.json"

    # Best-effort: report and bail rather than raise when nothing exists.
    if not target.exists():
        print(f"The file at {input_path} does not exist.")
        return

    config = json.loads(target.read_text(encoding="utf-8"))

    # Building blocks of the template: <s> A </s> (single) and
    # <s> A </s> B </s> (pair), with segment B carrying type_id 1.
    bos = {"SpecialToken": {"id": "<s>", "type_id": 0}}
    eos = {"SpecialToken": {"id": "</s>", "type_id": 0}}
    seq_a = {"Sequence": {"id": "A", "type_id": 0}}
    seq_b = {"Sequence": {"id": "B", "type_id": 1}}

    config["post_processor"] = {
        "type": "TemplateProcessing",
        "single": [bos, seq_a, eos],
        "pair": [
            bos,
            seq_a,
            eos,
            seq_b,
            {"SpecialToken": {"id": "</s>", "type_id": 1}},
        ],
        # NOTE(review): token ids 0 (<s>) and 2 (</s>) are hard-coded —
        # confirm they match the vocabulary of the tokenizer being edited.
        "special_tokens": {
            "<s>": {"id": "<s>", "ids": [0], "tokens": ["<s>"]},
            "</s>": {"id": "</s>", "ids": [2], "tokens": ["</s>"]},
        },
    }

    # Overwrite the original file with the updated configuration.
    with target.open("w", encoding="utf-8") as fh:
        json.dump(config, fh, ensure_ascii=False, indent=4)
    print(f"Tokenizer configuration at {input_path} has been updated.")
# Example usage:
# update_tokenizer_post_processor("path/to/your/tokenizer.json")
# Note: Uncomment and replace the path with your actual tokenizer.json file path to use the function.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment