Create a JSONL dataset by reading and processing lines from text files, concatenating a specified number of text lines into a single JSONL line, encoding newlines as \\n and allowing UTF-8 Unicode characters.
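For example (illustrative, not from an actual run): with lines_per_entry=3, the input lines "alpha", "beta" and "gamma" become one JSONL record of the form {"text": "alpha\\nbeta\\ngamma\\n"}, since each newline character is replaced by a literal backslash-n before JSON encoding.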
import os
import json
from glob import glob

from torch.utils.data import IterableDataset, DataLoader

class BatchProcessedDataset(IterableDataset):
    """
    A dataset which streams and processes lines from files, concatenating a
    specified number of lines into each yielded entry.
    """
    def __init__(self, files, batch_size=4096, lines_per_entry=20):
        """
        Constructor

        Arguments:
        ----------
        files: list[str]
            A list of files from which lines are streamed in order
        batch_size: int
            Batch size hint for downstream consumers (e.g. a tokenizer);
            not used by the dataset itself
        lines_per_entry: int
            Number of lines to concatenate into a single JSON entry
        """
        self.files = files
        self.batch_size = batch_size
        self.lines_per_entry = lines_per_entry

    def __nextbatch(self, f):
        """Read up to `lines_per_entry` lines, escaping each newline as a literal \\n."""
        lines = []
        for _, line in zip(range(self.lines_per_entry), f):
            if line:
                lines.append(line.replace("\n", "\\n"))
        return "".join(lines)

    def __iter__(self):
        for file_path in self.files:
            with open(file_path, encoding='utf-8') as f:
                while True:
                    batch = self.__nextbatch(f)
                    if not batch:
                        break
                    yield batch
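
# Minimal direct-iteration sketch (the path below is hypothetical):
#
#   dataset = BatchProcessedDataset(["./data/sample.txt"], lines_per_entry=5)
#   for entry in dataset:
#       print(entry)  # 5 source lines joined, with newlines escaped as \n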

def write_to_jsonl(dataset, output_file):
    with open(output_file, 'w', encoding='utf-8') as out_file:
        for entry in dataset:
            # ensure_ascii=False keeps UTF-8 characters as-is instead of \uXXXX escapes
            json_line = json.dumps({"text": entry}, ensure_ascii=False)
            out_file.write(json_line + '\n')
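
# Example output line (illustrative; actual text depends on the input files):
#
#   {"text": "first line\\nsecond line\\n"}
#
# Note: json.dumps escapes the backslash, so the escaped newline appears as
# \\n on disk; json.loads(line)["text"].replace("\\n", "\n") restores the
# original text.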

def main(input_folder, output_file, batch_size=4096, lines_per_entry=20):
    # Get the text files in the input folder, sorted so files are streamed
    # in a deterministic order (glob alone returns them in arbitrary order)
    text_files = sorted(glob(os.path.join(input_folder, "*.txt")))

    # Initialize the dataset and dataloader
    dataset = BatchProcessedDataset(text_files, batch_size, lines_per_entry)
    dataloader = DataLoader(dataset, batch_size=None)

    # Write the processed lines to the output JSONL file
    write_to_jsonl(dataloader, output_file)

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Process text files and output a JSONL-formatted file with newlines encoded.")
    parser.add_argument("input_folder", type=str, help="Folder containing text files to process")
    parser.add_argument("output_file", type=str, help="Output JSONL formatted file")
    parser.add_argument("--batch_size", type=int, default=4096, help="Batch size for processing lines (default: 4096)")
    parser.add_argument("--lines_per_entry", type=int, default=20, help="Number of lines to concatenate into a single JSON entry (default: 20)")
    args = parser.parse_args()
    main(args.input_folder, args.output_file, args.batch_size, args.lines_per_entry)
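
# Example invocation (script filename and paths are hypothetical):
#
#   python create_jsonl_dataset.py ./my_text_corpus ./dataset.jsonl --lines_per_entry 10
#
# This reads every *.txt file in ./my_text_corpus and writes one JSON object
# per 10 input lines to ./dataset.jsonl.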