Skip to content

Instantly share code, notes, and snippets.

@ZackAkil
Created August 27, 2024 16:03
Show Gist options
  • Save ZackAkil/f58b8c9e2a9a7493357eb45a48d13988 to your computer and use it in GitHub Desktop.
Save ZackAkil/f58b8c9e2a9a7493357eb45a48d13988 to your computer and use it in GitHub Desktop.
This Python script automates the process of converting assisted-labeled images stored in Google Cloud Storage into a labeled dataset suitable for training an AutoML object detection model. It extracts bounding box annotations from image metadata, structures them into the AutoML format, and writes the resulting records to a JSONL file (labels.jsonl).
# Import necessary libraries
import argparse # Command-line argument parsing
import json # JSON handling
# Google Cloud Storage library for interacting with buckets and objects
from google.cloud import storage
# Function to extract bucket name and folder from a Google Storage location
def get_bucket_folder(location):
    """
    Split a Google Storage location into its bucket name and folder prefix.

    Args:
        location: A string such as 'my-bucket/images/'. A leading 'gs://'
            scheme is accepted and ignored, so full URIs work as well.

    Returns:
        A (bucket_name, folder) tuple. folder is everything after the first
        '/' and is an empty string when the location names only a bucket.
    """
    scheme = 'gs://'
    if location.startswith(scheme):
        # Strip only a leading scheme so callers may pass a full URI.
        location = location[len(scheme):]
    # partition() always yields three parts, so a bare bucket name simply
    # produces an empty folder without any length check.
    bucket_name, _, folder = location.partition('/')
    return bucket_name, folder
# Function to extract label metadata from raw metadata (presumably from Google Storage object)
def extract_label_metadata(raw_metadata):
    """
    Parse the JSON-encoded 'labels' entry of an object's metadata.

    Args:
        raw_metadata: A dictionary of raw metadata whose 'labels' value,
            when present, is a JSON string.

    Returns:
        The decoded JSON value (presumably a list of bounding-box
        dictionaries; verify against the labeling tool). An empty list is
        returned when there is no 'labels' entry or it is empty.

    Raises:
        json.JSONDecodeError: If the 'labels' value is not valid JSON.
    """
    labels_metadata = raw_metadata.get('labels')
    if not labels_metadata:
        # json.loads(None) raises TypeError and json.loads('') raises a
        # decode error; treat both as "no boxes" instead of crashing.
        return []
    return json.loads(labels_metadata)
# Function to create a label record in AutoML format
def create_automl_label_json(file_location, bboxes):
    """
    Build a single AutoML label record for one image.

    Args:
        file_location: The Google Storage URI of the image file.
        bboxes: An iterable of dictionaries, each with the keys
            'label_name', 'xmin', 'ymin', 'xmax' and 'ymax'. None is
            treated as "no boxes".

    Returns:
        A dictionary with 'imageGcsUri' set to file_location and
        'boundingBoxAnnotations' holding one entry per input box.

    Raises:
        KeyError: If a box is missing one of the required keys.
    """
    # Key order is kept stable so the serialized JSON lines do not change.
    annotations = [
        {
            'displayName': box['label_name'],
            'yMin': box['ymin'],
            'xMin': box['xmin'],
            'yMax': box['ymax'],
            'xMax': box['xmax'],
        }
        for box in (bboxes or [])  # tolerate None, e.g. a JSON null
    ]
    return {
        'imageGcsUri': file_location,
        'boundingBoxAnnotations': annotations,
    }
# Main execution block
def main():
    """
    Export the labels stored on objects in a Google Storage folder.

    Reads the storage location from the command line, lists the objects
    under it, converts each object's 'labels' metadata into an AutoML label
    record and writes all records to 'labels.jsonl' (one JSON document per
    line) in the current working directory. Objects without usable label
    metadata are reported and skipped.
    """
    # Set up command-line argument parsing
    parser = argparse.ArgumentParser(
        description='Turn assisted labeled images in Google Storage into a labeled dataset for AutoML.')
    parser.add_argument('gs_location', metavar='gsl', type=str,
                        help='Location of Google Cloud Storage folder.')
    args = parser.parse_args()

    # Strip only a leading scheme; str.replace would also remove any later
    # occurrence of 'gs://' inside the object path.
    location = args.gs_location
    if location.startswith('gs://'):
        location = location[len('gs://'):]
    bucket_name, folder = get_bucket_folder(location)

    storage_client = storage.Client()
    # NOTE(review): prefix looks like a plain string match, so 'images'
    # would also select 'images2/...'; pass a trailing '/' to restrict the
    # listing to a single folder - confirm against the list_blobs docs.
    blobs = storage_client.list_blobs(bucket_name, prefix=folder)

    label_records_list = []
    for blob in blobs:
        if blob.metadata is None:  # Skip blobs without metadata
            print(f'Skipping {blob.name}, no metadata')
            continue
        if not blob.metadata.get('labels'):
            # Metadata exists but holds no labels; parsing would fail.
            print(f'Skipping {blob.name}, no labels in metadata')
            continue
        try:
            labels_metadata = extract_label_metadata(blob.metadata)
        except ValueError as err:  # json.JSONDecodeError is a ValueError
            print(f'Skipping {blob.name}, invalid labels JSON: {err}')
            continue

        # Construct the full Google Storage URI for the blob
        full_file_name = f'gs://{bucket_name}/{blob.name}'
        label_records_list.append(
            create_automl_label_json(full_file_name, labels_metadata))

    # Records are collected first so that a failure while listing does not
    # leave a truncated labels file behind.
    with open('labels.jsonl', 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(record) + "\n" for record in label_records_list)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment