Created November 8, 2023 01:18
-
-
Save arcanite24/3c43a28ca259d8f2df07033b888a8641 to your computer and use it in GitHub Desktop.
Convert Kohya dataset to HF
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import os

# Directory containing the Kohya-style dataset: every image `name.png` is
# expected to have its caption in a sibling file `name.txt`.
folder_path = 'datasets/pixel-art-xl-lite-v1'
# Destination of the generated metadata file (one JSON object per line).
output_jsonl_filename = 'datasets/pixel-art-xl-lite-v1/metadata.jsonl'


def _find_pairs(folder):
    """Return the (png_name, txt_name) pairs found directly inside *folder*.

    Images are visited in sorted order so the result does not depend on the
    arbitrary ordering of ``os.listdir``. An image without a caption file of
    the same base name is reported on stdout and left out of the result.
    """
    # A set gives O(1) membership tests instead of rescanning a list for
    # every image.
    names = set(os.listdir(folder))
    pairs = []
    for png_file in sorted(name for name in names if name.endswith('.png')):
        base_filename = os.path.splitext(png_file)[0]
        txt_file = f"{base_filename}.txt"
        if txt_file in names:
            pairs.append((png_file, txt_file))
        else:
            print(f"No matching .txt file for image {png_file}")
    return pairs


def write_metadata(folder=folder_path, output_path=output_jsonl_filename):
    """Write a ``metadata.jsonl`` describing the image/caption pairs in *folder*.

    Each output line is a JSON object with two keys: ``file_name`` (the image
    file name, relative to *folder*) and ``text`` (the caption file's content
    with surrounding whitespace stripped).

    Args:
        folder: Directory holding the ``.png`` images and ``.txt`` captions.
        output_path: Path of the ``.jsonl`` file to create or overwrite.

    Returns:
        The number of records written.

    Raises:
        OSError: If *folder* cannot be listed or a file cannot be opened.
    """
    # Collect the pairs before opening the output so that a missing folder
    # does not leave an empty metadata file behind.
    pairs = _find_pairs(folder)
    with open(output_path, 'w', encoding='utf-8') as jsonl_file:
        for png_file, txt_file in pairs:
            caption_path = os.path.join(folder, txt_file)
            with open(caption_path, 'r', encoding='utf-8') as f:
                text_content = f.read().strip()
            json_obj = {
                "file_name": png_file,
                "text": text_content,
            }
            jsonl_file.write(json.dumps(json_obj) + '\n')
    return len(pairs)


if __name__ == "__main__":
    write_metadata(folder_path, output_jsonl_filename)
    print(f"Metadata .jsonl file created: {output_jsonl_filename}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment