Skip to content

Instantly share code, notes, and snippets.

@mikkohei13
Last active December 31, 2024 14:16
Show Gist options
  • Save mikkohei13/4566d80bb3b2bf03bc28a12537c86054 to your computer and use it in GitHub Desktop.
Save mikkohei13/4566d80bb3b2bf03bc28a12537c86054 to your computer and use it in GitHub Desktop.
# Python script that creates a CSV file with image file paths and their corresponding categories
# Should work on both Linux & Windows, and with unicode filenames
'''
Input format (semicolon-delimited CSV with a header row; each directory is
resolved relative to BASE_DIR):
directory;category
./lepidoptera/adult;adult
./lepidoptera/adult_specimen;adult_specimen
./lepidoptera/egg;egg
'''
import os
import csv
import sys
from pathlib import Path
# Base directory: every `directory` value read from the input CSV is joined onto this path
BASE_DIR = "./insect_images"
# Input and output CSV file names. Both files are semicolon-delimited:
# the input has the columns directory;category, the output filepath;category.
INPUT_CSV = "directories.csv"
OUTPUT_CSV = "images_with_categories.csv"
def validate_directory(directory_path):
    """
    Check that `directory_path` is an existing directory.

    Returns None when the directory exists. Otherwise prints an error
    message and terminates the script with exit status 1 (SystemExit).
    A path that exists but is not a directory (e.g. a regular file) is
    rejected as well.
    """
    # isdir() rather than exists(): a regular file with this name must not
    # pass validation, because the path is subsequently walked as a
    # directory tree and a file would silently produce no images.
    if not os.path.isdir(directory_path):
        print(f"Error: Directory '{directory_path}' does not exist.")
        sys.exit(1)
def read_categories(input_csv, base_dir=None):
    """
    Read the directory-to-category mappings from the input CSV file.

    The file is a semicolon-delimited CSV whose header row names the
    columns `directory` and `category`.

    Parameters:
        input_csv: Path of the CSV file to read.
        base_dir: Directory that each `directory` value is joined onto.
            Defaults to the module-level BASE_DIR.

    Returns:
        A dict mapping each joined directory path to its category. If the
        same directory is listed more than once, the last row wins.

    Raises:
        KeyError: When a data row is read and the header lacks the
            `directory` or `category` column.
    """
    if base_dir is None:
        base_dir = BASE_DIR
    categories = {}
    # utf-8-sig strips the byte-order mark that Windows tools often prepend;
    # left in place it would turn the first header into '\ufeffdirectory'
    # and make the row['directory'] lookup fail. Files without a BOM are
    # read exactly as with plain utf-8.
    # newline="" lets the csv module handle line endings itself, as its
    # documentation requires for file objects.
    with open(input_csv, mode="r", newline="", encoding="utf-8-sig") as file:
        reader = csv.DictReader(file, delimiter=';')
        for row in reader:
            dir_path = os.path.join(base_dir, row['directory'])
            categories[dir_path] = row['category']
    return categories
def get_image_files(directory_path):
    """
    Get all image files (jpg, jpeg) in the given directory and subdirectories.

    Extensions are matched case-insensitively. Each returned path is the
    walked directory joined with the file name.

    The result is ordered deterministically: the files of a directory come
    first (sorted by name), followed by its subdirectories in sorted name
    order. A path that cannot be walked (e.g. it does not exist) yields an
    empty list.
    """
    image_extensions = (".jpg", ".jpeg")
    image_files = []
    # Walk through all files and subdirectories
    for root, dirs, files in os.walk(directory_path):
        # os.walk reports names in whatever order the filesystem returns
        # them. Sorting `dirs` in place fixes the traversal order, and
        # sorting `files` fixes the order within a directory, so the same
        # tree always produces the same list.
        dirs.sort()
        image_files.extend(
            os.path.join(root, name)
            for name in sorted(files)
            if name.lower().endswith(image_extensions)
        )
    return image_files
def generate_image_category_csv(categories, output_csv):
    """
    Generate a CSV file with image file paths and their corresponding categories.

    Parameters:
        categories: Mapping of directory path -> category.
        output_csv: Path of the CSV file to write (overwritten if present).

    The output is semicolon-delimited, UTF-8 encoded, and starts with the
    header row `filepath;category`. One row is written for every image
    found under each directory; file paths use forward slashes.

    Every directory is checked with validate_directory, which terminates
    the script if one is missing. Nothing is written in that case.
    """
    # Validate everything before the output file is opened: opening in "w"
    # mode truncates it, so a failure discovered part-way through would
    # destroy any previous output and leave an incomplete file behind.
    for directory in categories:
        validate_directory(directory)
    with open(output_csv, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file, delimiter=';')
        writer.writerow(["filepath", "category"])
        for directory, category in categories.items():
            for image_file in get_image_files(directory):
                # Normalize path separators to forward slashes
                writer.writerow([Path(image_file).as_posix(), category])
def main():
    """
    Script entry point: turn the directory list into the image/category CSV.
    """
    # Load the directory -> category mapping and write one row per image found
    generate_image_category_csv(read_categories(INPUT_CSV), OUTPUT_CSV)
    print(f"Image list with categories saved to '{OUTPUT_CSV}'.")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment