Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save Norod/7379927a41fc37a448ca5433beec0061 to your computer and use it in GitHub Desktop.
Save Norod/7379927a41fc37a448ca5433beec0061 to your computer and use it in GitHub Desktop.
Create a JSONL dataset by reading and processing lines from text files, concatenating a specified number of text lines into a single JSONL line, encoding newlines as \\n and allowing UTF-8 unicode characters
import os
import json
from glob import glob
from torch.utils.data import IterableDataset, DataLoader
class BatchProcessedDataset(IterableDataset):
    """
    Stream lines from a sequence of text files, yielding string entries
    that each concatenate up to ``lines_per_entry`` lines with every
    newline character escaped as the two-character sequence ``\\n``.
    """

    def __init__(self, files, batch_size=4096, lines_per_entry=20):
        """
        Arguments:
        ----------
        files: list[str]
            Files whose lines are streamed, in order.
        batch_size:
            Batch size of lines intended for a downstream tokenizer.
            NOTE(review): not used by this class itself — presumably
            consumed by callers; confirm before removing.
        lines_per_entry:
            How many lines are concatenated into one yielded entry.
        """
        self.files = files
        self.batch_size = batch_size
        self.lines_per_entry = lines_per_entry

    def _next_entry(self, handle):
        """Pull up to ``lines_per_entry`` lines from *handle* and join them,
        replacing each newline with a literal backslash-n pair."""
        pieces = []
        for _ in range(self.lines_per_entry):
            raw = next(handle, None)
            if raw is None:
                # File exhausted mid-entry; return whatever was collected.
                break
            if raw:
                pieces.append(raw.replace("\n", "\\n"))
        return "".join(pieces)

    def __iter__(self):
        # Walk the files in order, emitting entries until each is drained.
        for path in self.files:
            with open(path, encoding='utf-8') as handle:
                entry = self._next_entry(handle)
                while entry:
                    yield entry
                    entry = self._next_entry(handle)
def write_to_jsonl(dataset, output_file):
    """Write each entry of *dataset* to *output_file* as one JSON object
    per line of the form ``{"text": entry}``, keeping UTF-8 characters
    unescaped (``ensure_ascii=False``)."""
    with open(output_file, 'w', encoding='utf-8') as sink:
        for text in dataset:
            record = json.dumps({"text": text}, ensure_ascii=False)
            sink.write(record + '\n')
def main(input_folder, output_file, batch_size=4096, lines_per_entry=20):
    """Convert every ``*.txt`` file in *input_folder* into a single JSONL
    file at *output_file*, with ``lines_per_entry`` lines per JSON entry."""
    # Collect the source files to stream through.
    sources = glob(os.path.join(input_folder, "*.txt"))
    dataset = BatchProcessedDataset(sources, batch_size, lines_per_entry)
    # batch_size=None disables DataLoader batching: entries pass through
    # one at a time, unmodified.
    loader = DataLoader(dataset, batch_size=None)
    write_to_jsonl(loader, output_file)
if __name__ == "__main__":
    import argparse

    # Command-line entry point: folder in, JSONL file out.
    cli = argparse.ArgumentParser(
        description="Process text files and output JSONL formatted file with newlines encoded.")
    cli.add_argument("input_folder", type=str,
                     help="Folder containing text files to process")
    cli.add_argument("output_file", type=str,
                     help="Output JSONL formatted file")
    cli.add_argument("--batch_size", type=int, default=4096,
                     help="Batch size for processing lines (default: 4096)")
    cli.add_argument("--lines_per_entry", type=int, default=20,
                     help="Number of lines to concatenate into a single JSON entry (default: 20)")
    opts = cli.parse_args()
    main(opts.input_folder, opts.output_file, opts.batch_size, opts.lines_per_entry)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment