Created
June 30, 2024 02:26
-
-
Save victorchall/697c031771e77f820e4d7030097b472a to your computer and use it in GitHub Desktop.
Python script to create 1GB tar files for webp/txt/json tuple dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import tarfile | |
import json | |
from collections import defaultdict | |
def group_files(root_dir):
    """Group dataset files under ``root_dir`` by extension-less file name.

    Recursively walks ``root_dir`` and collects every ``.webp``, ``.txt`` and
    ``.json`` file, skipping the bookkeeping files ``urls.txt`` and
    ``caption_cog_params.txt``. Files are keyed by their base name only, so
    files sharing a stem in different subdirectories end up in one group.

    Directories and files are visited in sorted order, which makes the order
    of the groups (and of the paths inside each group) reproducible across
    runs and filesystems.

    Args:
        root_dir: Directory to scan.

    Returns:
        A ``defaultdict(list)`` mapping each stem to the full paths of the
        files that share it.
    """
    skipped_names = ("urls.txt", "caption_cog_params.txt")
    wanted_suffixes = ('.webp', '.txt', '.json')
    file_groups = defaultdict(list)
    for subdir, dirs, files in os.walk(root_dir):
        # os.walk yields entries in arbitrary filesystem order; sorting the
        # directory list in place fixes the order of the top-down traversal.
        dirs.sort()
        for file in sorted(files):
            if file in skipped_names or not file.endswith(wanted_suffixes):
                continue
            basename = os.path.splitext(file)[0]
            file_groups[basename].append(os.path.join(subdir, file))
    return file_groups
def create_tar_files(file_groups, output_dir, max_size=1024*1024*1024):  # 1 GB
    """Pack file groups into sequentially numbered, uncompressed tar archives.

    Archives are written to ``output_dir`` as ``00001.tar``, ``00002.tar``, ...
    A group is never split across archives: when adding the next group would
    push the running total of raw file sizes past ``max_size``, the current
    archive is closed and a new one is started. A single group larger than
    ``max_size`` therefore still gets written, into an archive of its own.
    Members are stored under their base name only (no directory component).

    Args:
        file_groups: Mapping of group name to a list of file paths.
        output_dir: Existing directory that receives the archives.
        max_size: Soft per-archive limit in bytes, measured on the raw file
            sizes (tar header/padding overhead is not counted).

    Raises:
        OSError: If a listed file cannot be read or an archive cannot be
            written. The archive open at that moment is still closed
            properly before the error propagates.
    """
    current_tar = None
    current_size = 0
    tar_count = 1
    try:
        for files in file_groups.values():
            group_size = sum(os.path.getsize(f) for f in files)
            if current_tar is None or current_size + group_size > max_size:
                if current_tar is not None:
                    current_tar.close()
                    # Forget the closed archive so a failing open() below
                    # does not leave a stale handle for the finally clause.
                    current_tar = None
                tar_filename = os.path.join(output_dir, f'{tar_count:05d}.tar')
                current_tar = tarfile.open(tar_filename, 'w')
                current_size = 0
                tar_count += 1
            for file in files:
                current_tar.add(file, arcname=os.path.basename(file))
            current_size += group_size
    finally:
        # Always finalize the open archive, even on error, so the
        # end-of-archive marker is written and the file handle is released.
        if current_tar is not None:
            current_tar.close()
def check_filegroups(file_groups):
    """Verify that every group holds a ``.txt``, a ``.webp`` and a ``.json`` file.

    Prints a confirmation line and the number of groups when all groups are
    complete.

    Args:
        file_groups: Mapping of group name to a list of file paths.

    Raises:
        AssertionError: On the first group that lacks one of the required
            extensions; the message names the missing extension and the
            group. Raised explicitly rather than via ``assert`` so the check
            still runs under ``python -O``.
    """
    required_suffixes = (".txt", ".webp", ".json")
    for group, paths in file_groups.items():
        for suffix in required_suffixes:
            if not any(path.endswith(suffix) for path in paths):
                raise AssertionError(f"Missing {suffix} for {group}")
    print("all file groups checked out ok")
    print(f"total: {len(file_groups)}")
def main():
    """Interactively pack a dataset directory into tar archives.

    Prompts for the dataset root and the output directory, creates the output
    directory if needed, groups the dataset files, validates that every group
    is complete, and then writes the archives.
    """
    root_dir = input("Enter the root directory path: ")
    output_dir = input("Enter the output directory path for TAR files: ")
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)
    file_groups = group_files(root_dir)
    check_filegroups(file_groups)
    create_tar_files(file_groups, output_dir)
    print("TAR files have been created successfully.")
# Run the interactive packer only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment