Count tokens in large JSONL datasets using a Hugging Face tokenizer
import glob
import json
import multiprocessing
from tqdm import tqdm
from transformers import AutoTokenizer
model_id = "tokenzer_model"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

def token_num(file):
    """Count tokens in one JSONL file (one JSON object with a 'text' field per line)."""
    file_tokens = 0
    with open(file) as f:
        for line in f:
            data = json.loads(line)
            text = data["text"]
            tokens = tokenizer.tokenize(text)
            file_tokens += len(tokens)
    print(f"file tokens: {file_tokens}")
    return file_tokens

if __name__ == "__main__":
    files = glob.glob('/data2/mybndatasets/splited_data/train/*.jsonl')
    print(f"files: {len(files)}")

    total_tokens = 0
    # One worker process per CPU core; each file is tokenized independently.
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        for tokens in tqdm(pool.imap(token_num, files), total=len(files)):
            total_tokens += tokens
    print(f"total tokens: {total_tokens}")