kalomaze · January 20, 2025 22:22
diff --git a/qwen_tokenize_test.py b/qwen_tokenize_test.py
 from transformers import AutoTokenizer
 from huggingface_hub import snapshot_download
 import os

 def add_token_boundaries(tokenizer, tokens):
     """Render every token as its decoded text wrapped in square brackets.

     Args:
         tokenizer: Object exposing ``decode(list_of_ids)``; it is called
             once per token with a single-element list.
         tokens: Iterable of token IDs.

     Returns:
         The bracketed pieces joined by single spaces, e.g. ``"[a] [b]"``,
         or an empty string when ``tokens`` is empty.
     """
     # Decode one ID at a time so each bracket pair marks exactly one token;
     # joining avoids repeated string concatenation and a trailing space.
     return " ".join(f"[{tokenizer.decode([token])}]" for token in tokens)

 def main():
     """Download the Qwen2.5-7B tokenizer files if needed and print a sample tokenization.

     The repository snapshot is fetched into ``qwen_7b`` unless that path
     already exists. The tokenizer is then loaded from that directory, a
     fixed multi-line text is encoded, and the token count, the original
     text, the per-token bracketed rendering and the raw token IDs are
     printed. If the tokenizer cannot be loaded, the error is printed and
     the function returns without tokenizing.
     """
     repo_id = "Qwen/Qwen2.5-7B"
     local_dir = "qwen_7b"

     # Only the existence of the path is checked; an existing directory is
     # reused as-is.
     if not os.path.exists(local_dir):
         print(f"Downloading {repo_id} to {local_dir}...")
         snapshot_download(
             repo_id,
             local_dir=local_dir,
             ignore_patterns=["*.safetensors", "*.bin"],  # Ignore large model files
         )
     else:
         print(f"Directory {local_dir} already exists")

     # Keep the try limited to the load itself so that "Error loading
     # tokenizer" is only ever reported for an actual load failure.
     try:
         # NOTE(review): trust_remote_code=True permits running code shipped
         # with the downloaded files; confirm it is required here.
         tokenizer = AutoTokenizer.from_pretrained(local_dir, trust_remote_code=True)
     except Exception as e:
         print(f"Error loading tokenizer: {e}")
         return
     print("Tokenizer loaded successfully")

     # Test text to tokenize
     test_text = """You might think you've peeped the scene
 You haven't, the real one's far too mean
 The watered-down one, the one you know
 Was made up centuries ago
 They made it sound all wack and corny
 Yes, it's awful blasted boring
 Twisted fictions, sick addiction
 Well, gather 'round, children, zip it, listen"""

     tokens = tokenizer.encode(test_text)
     decoded_with_boundaries = add_token_boundaries(tokenizer, tokens)

     print("\nTokenization results:")
     print(f"Number of tokens: {len(tokens)}")
     print("\nOriginal text:")
     print(test_text)
     print("\nDecoded text with token boundaries:")
     print(decoded_with_boundaries)

     # Print token IDs for reference
     print("\nToken IDs:")
     print(tokens)

 # Run the tokenizer demo only when this file is executed as a script,
 # not when it is imported.
 if __name__ == "__main__":
    main()
	from transformers import AutoTokenizer
	from huggingface_hub import snapshot_download
	import os

	def add_token_boundaries(tokenizer, tokens):
	    """Render every token as its decoded text wrapped in square brackets.

	    Args:
	        tokenizer: Object exposing ``decode(list_of_ids)``; it is called
	            once per token with a single-element list.
	        tokens: Iterable of token IDs.

	    Returns:
	        The bracketed pieces joined by single spaces, e.g. ``"[a] [b]"``,
	        or an empty string when ``tokens`` is empty.
	    """
	    # Decode one ID at a time so each bracket pair marks exactly one token;
	    # joining avoids repeated string concatenation and a trailing space.
	    return " ".join(f"[{tokenizer.decode([token])}]" for token in tokens)

	def main():
	    """Download the Qwen2.5-7B tokenizer files if needed and print a sample tokenization.

	    The repository snapshot is fetched into ``qwen_7b`` unless that path
	    already exists. The tokenizer is then loaded from that directory, a
	    fixed multi-line text is encoded, and the token count, the original
	    text, the per-token bracketed rendering and the raw token IDs are
	    printed. If the tokenizer cannot be loaded, the error is printed and
	    the function returns without tokenizing.
	    """
	    repo_id = "Qwen/Qwen2.5-7B"
	    local_dir = "qwen_7b"

	    # Only the existence of the path is checked; an existing directory is
	    # reused as-is.
	    if not os.path.exists(local_dir):
	        print(f"Downloading {repo_id} to {local_dir}...")
	        snapshot_download(
	            repo_id,
	            local_dir=local_dir,
	            # The patterns are shell-style globs, so the leading "*" is
	            # required for them to match the weight files by extension.
	            ignore_patterns=["*.safetensors", "*.bin"],  # Ignore large model files
	        )
	    else:
	        print(f"Directory {local_dir} already exists")

	    # Keep the try limited to the load itself so that "Error loading
	    # tokenizer" is only ever reported for an actual load failure.
	    try:
	        # NOTE(review): trust_remote_code=True permits running code shipped
	        # with the downloaded files; confirm it is required here.
	        tokenizer = AutoTokenizer.from_pretrained(local_dir, trust_remote_code=True)
	    except Exception as e:
	        print(f"Error loading tokenizer: {e}")
	        return
	    print("Tokenizer loaded successfully")

	    # Test text to tokenize
	    test_text = """You might think you've peeped the scene
	You haven't, the real one's far too mean
	The watered-down one, the one you know
	Was made up centuries ago
	They made it sound all wack and corny
	Yes, it's awful blasted boring
	Twisted fictions, sick addiction
	Well, gather 'round, children, zip it, listen"""

	    tokens = tokenizer.encode(test_text)
	    decoded_with_boundaries = add_token_boundaries(tokenizer, tokens)

	    print("\nTokenization results:")
	    print(f"Number of tokens: {len(tokens)}")
	    print("\nOriginal text:")
	    print(test_text)
	    print("\nDecoded text with token boundaries:")
	    print(decoded_with_boundaries)

	    # Print token IDs for reference
	    print("\nToken IDs:")
	    print(tokens)

	# Run the tokenizer demo only when this file is executed as a script,
	# not when it is imported.
	if __name__ == "__main__":
	    main()