Created
August 27, 2024 16:03
-
-
Save ZackAkil/f58b8c9e2a9a7493357eb45a48d13988 to your computer and use it in GitHub Desktop.
This Python script automates the process of converting assisted labeled images stored in Google Cloud Storage into a labeled dataset suitable for training an AutoML object detection model. It extracts bounding box annotations from image metadata, structures them into the AutoML format, and writes the resulting records to a JSON Lines file (`labels.jsonl`), one record per line.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import necessary libraries | |
import argparse # Command-line argument parsing | |
import json # JSON handling | |
# Google Cloud Storage library for interacting with buckets and objects | |
from google.cloud import storage | |
# Function to extract bucket name and folder from a Google Storage location | |
def get_bucket_folder(location):
    """
    Split a Google Storage location into its bucket name and folder part.

    Args:
        location: A Google Storage location string without the 'gs://'
            scheme (e.g., 'my-bucket/images/').

    Returns:
        A (bucket_name, folder) tuple. The folder is everything after the
        first '/', or '' when the location contains no '/'.
    """
    # partition() cuts at the first '/' only; when there is no '/', the
    # tail comes back as '' which is exactly the "no folder" case.
    bucket_name, _separator, folder = location.partition('/')
    return bucket_name, folder
# Function to extract label metadata from raw metadata (presumably from Google Storage object) | |
def extract_label_metadata(raw_metadata):
    """
    Decode the JSON-encoded 'labels' entry of a metadata mapping.

    Args:
        raw_metadata: A dictionary of raw metadata. Its 'labels' value, when
            present, is expected to be a JSON string.

    Returns:
        The decoded JSON value of the 'labels' entry (the caller in this
        script iterates it as a list of bounding-box dictionaries). An empty
        list is returned when the 'labels' entry is missing, None, or an
        empty string.

    Raises:
        json.JSONDecodeError: If the 'labels' value is non-empty but is not
            valid JSON.
    """
    labels_metadata = raw_metadata.get('labels')
    if not labels_metadata:
        # json.loads(None) raises TypeError and json.loads('') raises
        # JSONDecodeError; metadata without labels just means "no boxes".
        return []
    return json.loads(labels_metadata)
# Function to create a label record in AutoML format | |
def create_automl_label_json(file_location, bboxes):
    """
    Build one AutoML label record for an image and its bounding boxes.

    Args:
        file_location: The Google Storage URI of the image file.
        bboxes: An iterable of dictionaries, each holding the keys
            'label_name', 'xmin', 'ymin', 'xmax' and 'ymax'.

    Returns:
        A dictionary with 'imageGcsUri' set to file_location and
        'boundingBoxAnnotations' set to a list with one annotation
        dictionary per input box.

    Raises:
        KeyError: If a box is missing one of the expected keys.
    """
    # Translate each box from the snake_case metadata keys to the
    # camelCase keys used in the output record.
    annotations = [
        {
            'displayName': bbox['label_name'],
            'yMin': bbox['ymin'],
            'xMin': bbox['xmin'],
            'yMax': bbox['ymax'],
            'xMax': bbox['xmax'],
        }
        for bbox in bboxes
    ]
    return {
        'imageGcsUri': file_location,
        'boundingBoxAnnotations': annotations,
    }
# Main execution block | |
if __name__ == "__main__":
    # Command-line interface: one positional argument naming the Cloud
    # Storage folder whose objects should be scanned.
    arg_parser = argparse.ArgumentParser(
        description='Turn assisted labeled images in Google Storage into a labeled dataset for AutoML.')
    arg_parser.add_argument('gs_location', metavar='gsl', type=str,
                            help='Location of Google Cloud Storage folder.')
    cli_args = arg_parser.parse_args()

    # Drop the 'gs://' scheme, then split what remains into the bucket
    # name and the object-name prefix used for listing.
    bucket_name, folder = get_bucket_folder(
        cli_args.gs_location.replace('gs://', ''))

    client = storage.Client()

    # Collect one label record per blob that carries metadata.
    records = []
    for item in client.list_blobs(bucket_name, prefix=folder):
        if item.metadata is None:
            # Nothing to decode for this object; report it and move on.
            print(f'Skipping {item.name}, no metadata')
            continue
        bboxes = extract_label_metadata(item.metadata)
        image_uri = f'gs://{bucket_name}/{item.name}'
        records.append(create_automl_label_json(image_uri, bboxes))

    # Emit JSON Lines: one serialized record per line.
    with open('labels.jsonl', 'w') as out_file:
        out_file.writelines(json.dumps(entry) + "\n" for entry in records)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment