ochafik · February 22, 2024 14:38
diff --git a/gistfile1.txt b/gistfile1.txt
 #
 # Example for Gemma 7b:
 #   wget -O tokenizer.json https://huggingface.co/google/gemma-7b/resolve/main/tokenizer.json?download=true
 #   python vocab_scavenge.py
 #   Matched 72378 / 256000 tokens
 #

 import json, sys, re

 # Defaults used when the script is run without arguments.
 DEFAULT_FILE = 'tokenizer.json'
 # Whole-token match on ASCII letters/digits, `_{}[]:"'`, newline, tab, space and `-`.
 DEFAULT_PATTERN = r'^([a-zA-Z0-9_{}\[\]:"\'\n\t -]+)$'
 # Tokens spelled "<0xNN>" with two uppercase hex digits, e.g. "<0x00>".
 BYTE_TOKEN_RX = re.compile(r'^<0x[0-9A-F]{2}>$')
 # Matches are always counted, but only those with an id below this are printed.
 PRINT_ID_LIMIT = 400


 def parse_args(argv):
 	"""Return ``(path, pattern)`` taken from ``argv[1:]``.

 	Both positional arguments are optional; a missing one falls back to
 	``DEFAULT_FILE`` / ``DEFAULT_PATTERN``.

 	Raises:
 		SystemExit: with a usage message if more than two arguments are given.
 	"""
 	args = argv[1:]
 	if len(args) > 2:
 		raise SystemExit(f'usage: {argv[0]} [tokenizer.json [pattern]]')
 	path = args[0] if len(args) >= 1 else DEFAULT_FILE
 	pattern = args[1] if len(args) == 2 else DEFAULT_PATTERN
 	return path, pattern


 def iter_matches(vocab, added_ids, pattern):
 	"""Yield ``(token, token_id)`` for every selected entry of ``vocab``.

 	An entry is selected when its token matches ``pattern``, or matches
 	``BYTE_TOKEN_RX``, or when its id is in ``added_ids``.

 	Args:
 		vocab: mapping of token string to token id.
 		added_ids: container of ids that are always selected.
 		pattern: regular expression (string or compiled) applied with ``match``.
 	"""
 	rx = re.compile(pattern)
 	for token, token_id in vocab.items():
 		if rx.match(token) or BYTE_TOKEN_RX.match(token) or token_id in added_ids:
 			yield token, token_id


 def main(argv):
 	"""Load the tokenizer JSON, print the low-id matches and a summary line."""
 	path, pattern = parse_args(argv)

 	# JSON text is UTF-8; do not depend on the platform's locale encoding.
 	with open(path, 'r', encoding='utf-8') as f:
 		tokenizer = json.load(f)

 	# Tolerate a missing or null "added_tokens" entry.
 	added_ids = {t['id'] for t in tokenizer.get('added_tokens') or []}
 	vocab = tokenizer['model']['vocab']

 	n_matches = 0
 	for token, token_id in iter_matches(vocab, added_ids, pattern):
 		if token_id < PRINT_ID_LIMIT:
 			print(f'"{token}": {token_id}')
 		n_matches += 1

 	print(f'Matched {n_matches} / {len(vocab)} tokens')


 if __name__ == '__main__':
 	main(sys.argv)
	#
	# Example for Gemma 7b:
	# wget -O tokenizer.json https://huggingface.co/google/gemma-7b/resolve/main/tokenizer.json?download=true
	# python vocab_scavenge.py
	# Matched 72378 / 256000 tokens
	#

	import json, sys, re

	# Defaults used when the script is run without arguments.
	DEFAULT_FILE = 'tokenizer.json'
	# Whole-token match on ASCII letters/digits, `_{}[]:"'`, newline, tab, space and `-`.
	DEFAULT_PATTERN = r'^([a-zA-Z0-9_{}\[\]:"\'\n\t -]+)$'
	# Tokens spelled "<0xNN>" with two uppercase hex digits, e.g. "<0x00>".
	BYTE_TOKEN_RX = re.compile(r'^<0x[0-9A-F]{2}>$')
	# Matches are always counted, but only those with an id below this are printed.
	PRINT_ID_LIMIT = 400


	def parse_args(argv):
		"""Return ``(path, pattern)`` taken from ``argv[1:]``.

		Both positional arguments are optional; a missing one falls back to
		``DEFAULT_FILE`` / ``DEFAULT_PATTERN``.

		Raises:
			SystemExit: with a usage message if more than two arguments are given.
		"""
		args = argv[1:]
		if len(args) > 2:
			raise SystemExit(f'usage: {argv[0]} [tokenizer.json [pattern]]')
		path = args[0] if len(args) >= 1 else DEFAULT_FILE
		pattern = args[1] if len(args) == 2 else DEFAULT_PATTERN
		return path, pattern


	def iter_matches(vocab, added_ids, pattern):
		"""Yield ``(token, token_id)`` for every selected entry of ``vocab``.

		An entry is selected when its token matches ``pattern``, or matches
		``BYTE_TOKEN_RX``, or when its id is in ``added_ids``.

		Args:
			vocab: mapping of token string to token id.
			added_ids: container of ids that are always selected.
			pattern: regular expression (string or compiled) applied with ``match``.
		"""
		rx = re.compile(pattern)
		for token, token_id in vocab.items():
			if rx.match(token) or BYTE_TOKEN_RX.match(token) or token_id in added_ids:
				yield token, token_id


	def main(argv):
		"""Load the tokenizer JSON, print the low-id matches and a summary line."""
		path, pattern = parse_args(argv)

		# JSON text is UTF-8; do not depend on the platform's locale encoding.
		with open(path, 'r', encoding='utf-8') as f:
			tokenizer = json.load(f)

		# Tolerate a missing or null "added_tokens" entry.
		added_ids = {t['id'] for t in tokenizer.get('added_tokens') or []}
		vocab = tokenizer['model']['vocab']

		n_matches = 0
		for token, token_id in iter_matches(vocab, added_ids, pattern):
			if token_id < PRINT_ID_LIMIT:
				print(f'"{token}": {token_id}')
			n_matches += 1

		print(f'Matched {n_matches} / {len(vocab)} tokens')


	if __name__ == '__main__':
		main(sys.argv)
No results found