iamlemec · February 22, 2024 18:38
diff --git a/check_tokenizer.py b/check_tokenizer.py
 def check_tokenizer(mod_ll, mod_hf, data, max_rows=None):
    """Print a colored token-level diff between two tokenizations of the same text.

    Every sample in ``data`` is tokenized with ``mod_ll`` and with ``mod_hf``.
    For each sample whose token ids differ, the index is printed followed by
    the ``mod_ll`` token sequence annotated with the edits needed to turn it
    into the ``mod_hf`` sequence: ``[-tok]`` for a token only ``mod_ll``
    produced, ``[+tok]`` for a token only ``mod_hf`` produced, and
    ``[old→new]`` for a substituted token.

    Args:
        mod_ll: Either a string, which is loaded with ``Llama(mod_ll,
            verbose=False)``, or an object exposing ``tokenize(bytes)``.
        mod_hf: Either a string, which is loaded with
            ``AutoTokenizer.from_pretrained``, or an object exposing
            ``encode(str)`` and ``_tokenizer.id_to_token(int)``.
        data: Either a path to a text file with one sample per line, or a
            sequence of strings.
        max_rows: If not ``None``, only the first ``max_rows`` samples are
            compared.

    Returns:
        None. All output is printed.
    """
    # load models; each backend is imported only when it has to be constructed,
    # so callers passing ready-made objects do not need the other package
    if isinstance(mod_ll, str):
        from llama_cpp import Llama
        mod_ll = Llama(mod_ll, verbose=False)
    if isinstance(mod_hf, str):
        from transformers import AutoTokenizer
        mod_hf = AutoTokenizer.from_pretrained(mod_hf)

    # load data; the text is encoded as UTF-8 below, so read it as UTF-8 too,
    # and close the file deterministically
    if isinstance(data, str):
        with open(data, encoding='utf-8') as f:
            data = f.read().splitlines()
    if max_rows is not None:
        data = data[:max_rows]

    # compute token ids
    ids_ll = [mod_ll.tokenize(text.encode('utf-8')) for text in data]
    ids_st = [mod_hf.encode(text) for text in data]

    mismatches = [
        (i, id_ll, id_st)
        for i, (id_ll, id_st) in enumerate(zip(ids_ll, ids_st))
        if id_ll != id_st
    ]
    if not mismatches:
        return

    # diffing/printing helpers are only needed once something differs
    from Levenshtein import editops
    from termcolor import cprint

    def tokmap(i, replace=False):
        tok = mod_hf._tokenizer.id_to_token(i)
        if tok is None:
            # id outside the vocabulary: show the raw id instead of crashing
            tok = f'<{i}>'
        if tok.startswith('##'):
            return tok[2:]
        pre = '_' if replace else ' '
        return f'{pre}{tok}'

    # compare token ids
    for i, id_ll, id_st in mismatches:
        print(f'Mismatch at index {i}')

        # editops yields (op, source_pos, dest_pos). Several inserts can share
        # one source position, and an insert can share it with a replace or
        # delete, so inserts are collected separately instead of in one dict.
        inserts = {}
        edits = {}
        for op, pos1, pos2 in editops(id_ll, id_st):
            if op == 'insert':
                inserts.setdefault(pos1, []).append(pos2)
            elif op in ('delete', 'replace'):
                edits[pos1] = (op, pos2)

        # iterate one past the end so trailing insertions are shown as well
        for pos1 in range(len(id_ll) + 1):
            for pos2 in inserts.get(pos1, ()):
                tok2 = tokmap(id_st[pos2], replace=True)
                cprint(f'[+{tok2}]', color='green', attrs=['bold'], end='')
            if pos1 == len(id_ll):
                break

            id1 = id_ll[pos1]
            if pos1 not in edits:
                print(tokmap(id1, replace=False), end='')
                continue

            op, pos2 = edits[pos1]
            tok1 = tokmap(id1, replace=True)
            if op == 'delete':
                # a delete has no destination token; pos2 may be len(id_st)
                cprint(f'[-{tok1}]', color='red', attrs=['bold'], end='')
            else:
                tok2 = tokmap(id_st[pos2], replace=True)
                print('[', end='')
                cprint(f'{tok1}', color='red', attrs=['bold'], end='')
                cprint(f'→{tok2}', color='green', attrs=['bold'], end='')
                print(']', end='')
        print('\n')
	def check_tokenizer(mod_ll, mod_hf, data, max_rows=None):
	    """Print a colored token-level diff between two tokenizations of the same text.

	    Every sample in ``data`` is tokenized with ``mod_ll`` and with ``mod_hf``.
	    For each sample whose token ids differ, the index is printed followed by
	    the ``mod_ll`` token sequence annotated with the edits needed to turn it
	    into the ``mod_hf`` sequence: ``[-tok]`` for a token only ``mod_ll``
	    produced, ``[+tok]`` for a token only ``mod_hf`` produced, and
	    ``[old→new]`` for a substituted token.

	    Args:
	        mod_ll: Either a string, which is loaded with ``Llama(mod_ll,
	            verbose=False)``, or an object exposing ``tokenize(bytes)``.
	        mod_hf: Either a string, which is loaded with
	            ``AutoTokenizer.from_pretrained``, or an object exposing
	            ``encode(str)`` and ``_tokenizer.id_to_token(int)``.
	        data: Either a path to a text file with one sample per line, or a
	            sequence of strings.
	        max_rows: If not ``None``, only the first ``max_rows`` samples are
	            compared.

	    Returns:
	        None. All output is printed.
	    """
	    # load models; each backend is imported only when it has to be constructed,
	    # so callers passing ready-made objects do not need the other package
	    if isinstance(mod_ll, str):
	        from llama_cpp import Llama
	        mod_ll = Llama(mod_ll, verbose=False)
	    if isinstance(mod_hf, str):
	        from transformers import AutoTokenizer
	        mod_hf = AutoTokenizer.from_pretrained(mod_hf)

	    # load data; the text is encoded as UTF-8 below, so read it as UTF-8 too,
	    # and close the file deterministically
	    if isinstance(data, str):
	        with open(data, encoding='utf-8') as f:
	            data = f.read().splitlines()
	    if max_rows is not None:
	        data = data[:max_rows]

	    # compute token ids
	    ids_ll = [mod_ll.tokenize(text.encode('utf-8')) for text in data]
	    ids_st = [mod_hf.encode(text) for text in data]

	    mismatches = [
	        (i, id_ll, id_st)
	        for i, (id_ll, id_st) in enumerate(zip(ids_ll, ids_st))
	        if id_ll != id_st
	    ]
	    if not mismatches:
	        return

	    # diffing/printing helpers are only needed once something differs
	    from Levenshtein import editops
	    from termcolor import cprint

	    def tokmap(i, replace=False):
	        tok = mod_hf._tokenizer.id_to_token(i)
	        if tok is None:
	            # id outside the vocabulary: show the raw id instead of crashing
	            tok = f'<{i}>'
	        if tok.startswith('##'):
	            return tok[2:]
	        pre = '_' if replace else ' '
	        return f'{pre}{tok}'

	    # compare token ids
	    for i, id_ll, id_st in mismatches:
	        print(f'Mismatch at index {i}')

	        # editops yields (op, source_pos, dest_pos). Several inserts can share
	        # one source position, and an insert can share it with a replace or
	        # delete, so inserts are collected separately instead of in one dict.
	        inserts = {}
	        edits = {}
	        for op, pos1, pos2 in editops(id_ll, id_st):
	            if op == 'insert':
	                inserts.setdefault(pos1, []).append(pos2)
	            elif op in ('delete', 'replace'):
	                edits[pos1] = (op, pos2)

	        # iterate one past the end so trailing insertions are shown as well
	        for pos1 in range(len(id_ll) + 1):
	            for pos2 in inserts.get(pos1, ()):
	                tok2 = tokmap(id_st[pos2], replace=True)
	                cprint(f'[+{tok2}]', color='green', attrs=['bold'], end='')
	            if pos1 == len(id_ll):
	                break

	            id1 = id_ll[pos1]
	            if pos1 not in edits:
	                print(tokmap(id1, replace=False), end='')
	                continue

	            op, pos2 = edits[pos1]
	            tok1 = tokmap(id1, replace=True)
	            if op == 'delete':
	                # a delete has no destination token; pos2 may be len(id_st)
	                cprint(f'[-{tok1}]', color='red', attrs=['bold'], end='')
	            else:
	                tok2 = tokmap(id_st[pos2], replace=True)
	                print('[', end='')
	                cprint(f'{tok1}', color='red', attrs=['bold'], end='')
	                cprint(f'→{tok2}', color='green', attrs=['bold'], end='')
	                print(']', end='')
	        print('\n')
No results found