@kalomaze
Created January 20, 2025 22:22
qwen tokenizer test
from transformers import AutoTokenizer
from huggingface_hub import snapshot_download
import os


def add_token_boundaries(tokenizer, tokens):
    """Add brackets around token boundaries."""
    text = ""
    for token in tokens:
        decoded = tokenizer.decode([token])
        text += f"[{decoded}] "
    return text.strip()


def main():
    # Define repository ID and local directory
    repo_id = "Qwen/Qwen2.5-7B"
    local_dir = "qwen_7b"

    # Check if the directory already exists
    if not os.path.exists(local_dir):
        print(f"Downloading {repo_id} to {local_dir}...")
        snapshot_download(
            repo_id,
            local_dir=local_dir,
            ignore_patterns=["*.safetensors", "*.bin"],  # Ignore large model files
        )
    else:
        print(f"Directory {local_dir} already exists")

    # Load tokenizer
    try:
        tokenizer = AutoTokenizer.from_pretrained(local_dir, trust_remote_code=True)
        print("Tokenizer loaded successfully")

        # Test text to tokenize
        test_text = """You might think you've peeped the scene
You haven't, the real one's far too mean
The watered-down one, the one you know
Was made up centuries ago
They made it sound all wack and corny
Yes, it's awful blasted boring
Twisted fictions, sick addiction
Well, gather 'round, children, zip it, listen"""

        # Tokenize the text
        tokens = tokenizer.encode(test_text)

        # Get decoded text with boundaries
        decoded_with_boundaries = add_token_boundaries(tokenizer, tokens)

        print("\nTokenization results:")
        print(f"Number of tokens: {len(tokens)}")
        print("\nOriginal text:")
        print(test_text)
        print("\nDecoded text with token boundaries:")
        print(decoded_with_boundaries)

        # Print token IDs for reference
        print("\nToken IDs:")
        print(tokens)
    except Exception as e:
        print(f"Error loading tokenizer: {str(e)}")


if __name__ == "__main__":
    main()
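
For comparison, the raw BPE pieces can also be inspected with tokenizer.convert_ids_to_tokens, which keeps the byte-level markers (e.g. Ġ for a leading space) instead of decoding each ID back to a plain string. A minimal sketch, assuming the qwen_7b directory from the script above has already been downloaded:

from transformers import AutoTokenizer

# Assumes the snapshot was already downloaded to ./qwen_7b by the script above
tokenizer = AutoTokenizer.from_pretrained("qwen_7b", trust_remote_code=True)

text = "Well, gather 'round, children, zip it, listen"
ids = tokenizer.encode(text)

# Raw BPE pieces (byte-level, with Ġ marking a leading space) vs. decoded strings
pieces = tokenizer.convert_ids_to_tokens(ids)
decoded = [tokenizer.decode([i]) for i in ids]

for token_id, piece, string in zip(ids, pieces, decoded):
    print(f"{token_id:>8}  {piece!r:>16}  {string!r}")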