@TerenceLiu98
Created February 17, 2025 15:14
import concurrent.futures
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from datasets import load_dataset
from transformers import AutoTokenizer

data_path = "/data/"
model_path = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_path)

train_list = [x for x in Path(data_path).glob("*_train.jsonl") if x.is_file()]
prefix_text = "You are an AI Assistant in Political Science and Linguistics."

def prompt(example):
    """Build the chat messages (system + user) for a single example."""
    return [
        {"role": "system", "content": prefix_text + example["instruction"]},
        {"role": "user", "content": example["input"]}
    ]

def process_file(file_path):
    """Loads a dataset and calculates token length for all samples."""
    dataset = load_dataset("json", data_files={"data": str(file_path)}, split="data")
    total_tokens = sum(len(tokenizer.apply_chat_template(prompt(dataset[i]))) for i in range(len(dataset)))
    return str(file_path), total_tokens

# Use ThreadPoolExecutor for parallel execution
name, tokens = [], []
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:  # Adjust max_workers as needed
    results = list(tqdm(executor.map(process_file, train_list), total=len(train_list)))

# Unpack results
name, tokens = zip(*results)

# Convert to DataFrame
train = pd.DataFrame({"train name": name, "tokens": tokens})
print(train)
print(f"total tokens: {sum(train['tokens'])}")