Counting tokens over large JSONL datasets with a Hugging Face tokenizer, parallelized across files with multiprocessing.
import glob
import json
import multiprocessing

from tqdm import tqdm
from transformers import AutoTokenizer

model_id = "tokenizer_model"  # path or Hub id of the tokenizer (placeholder)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

def token_num(file):
    """Count tokens in one JSONL file (one {"text": ...} object per line)."""
    file_tokens = 0
    with open(file, encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            tokens = tokenizer.tokenize(data["text"])
            file_tokens += len(tokens)
    print(f"file tokens: {file_tokens}")
    return file_tokens

# Guard so the script also works with "spawn" start methods (Windows/macOS).
if __name__ == "__main__":
    files = glob.glob("/data2/mybndatasets/splited_data/train/*.jsonl")
    print(len(files))

    total_tokens = 0
    # One worker per CPU core; each task tokenizes a whole file, so the
    # per-task dispatch overhead stays small relative to the work done.
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        for tokens in tqdm(pool.imap(token_num, files), total=len(files)):
            total_tokens += tokens
    print(f"total tokens: {total_tokens}")