Skip to content

Instantly share code, notes, and snippets.

@arcanite24
Created November 8, 2023 01:18
Show Gist options
  • Save arcanite24/3c43a28ca259d8f2df07033b888a8641 to your computer and use it in GitHub Desktop.
Save arcanite24/3c43a28ca259d8f2df07033b888a8641 to your computer and use it in GitHub Desktop.
Convert a Kohya-style dataset (.png images with same-named .txt captions) into a Hugging Face metadata.jsonl file
import json
import os
# Replace these with the directory containing your .png and .txt files and
# with the path of the metadata file to generate.
folder_path = 'datasets/pixel-art-xl-lite-v1'
output_jsonl_filename = 'datasets/pixel-art-xl-lite-v1/metadata.jsonl'


def find_pairs(folder):
    """Return (image, caption) file-name pairs found directly in *folder*.

    An image is any entry ending in ``.png``; its caption is the entry with
    the same base name and a ``.txt`` extension. Images without a caption
    are reported on stdout and left out of the result.

    Args:
        folder: Directory to scan (not recursive).

    Returns:
        A list of ``(png_name, txt_name)`` tuples sorted by image name.

    Raises:
        OSError: If *folder* cannot be listed.
    """
    files = os.listdir(folder)
    # A set keeps the per-image caption lookup O(1) instead of scanning a list.
    txt_files = {name for name in files if name.endswith('.txt')}

    pairs = []
    # os.listdir returns entries in arbitrary order; sort so that the
    # generated metadata is identical from run to run.
    for png_file in sorted(name for name in files if name.endswith('.png')):
        base_filename = os.path.splitext(png_file)[0]
        txt_file = f"{base_filename}.txt"
        if txt_file in txt_files:
            pairs.append((png_file, txt_file))
        else:
            print(f"No matching .txt file for image {png_file}")
    return pairs


def write_metadata(folder, output_path):
    """Write one JSON line per image/caption pair of *folder* to *output_path*.

    Each line is an object with the keys ``file_name`` (the image file name)
    and ``text`` (the caption file content with surrounding whitespace
    stripped).

    Args:
        folder: Directory containing the .png and .txt files.
        output_path: Path of the .jsonl file to create or overwrite.

    Returns:
        The number of records written.

    Raises:
        OSError: If the folder, a caption file or the output file cannot be
            accessed.
    """
    # List the folder before creating the output file, as the output may
    # live inside the same folder.
    pairs = find_pairs(folder)

    # Explicit encoding so the result does not depend on the platform locale.
    with open(output_path, 'w', encoding='utf-8') as jsonl_file:
        for png_file, txt_file in pairs:
            caption_path = os.path.join(folder, txt_file)
            with open(caption_path, 'r', encoding='utf-8') as f:
                text_content = f.read().strip()
            json_obj = {
                "file_name": png_file,
                "text": text_content
            }
            jsonl_file.write(json.dumps(json_obj) + '\n')
    return len(pairs)


def main():
    """Generate the metadata file for the configured dataset folder."""
    write_metadata(folder_path, output_jsonl_filename)
    print(f"Metadata .jsonl file created: {output_jsonl_filename}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment