Skip to content

Instantly share code, notes, and snippets.

@imajes
Created July 29, 2025 20:46
Show Gist options
  • Save imajes/d62862b132185ac4162ff997025ef42d to your computer and use it in GitHub Desktop.
Save imajes/d62862b132185ac4162ff997025ef42d to your computer and use it in GitHub Desktop.
apple-aerial-asset-parser
#!/usr/bin/env python3
"""
process_entries.py
Usage:
python process_entries.py entries.json > final_output.json
Outputs a JSON object with:
- assets: { CategoryName: { SubcategoryName: [ …assets… ] } }
- taxonomy: [ { id, name, representative_asset_id, preview_image, subcategories: [ … ] } ]
"""
import json
import sys
import re
from collections import defaultdict
# === which asset fields to pull & how to rename them ===
# Keys are field names as they appear on each raw asset record; values are the
# names those fields get in the emitted JSON. The asset-grouping step reads
# every listed key with dict.get, so a field missing from an asset is emitted
# as null instead of raising.
ASSET_FIELDS = {
    "accessibilityLabel": "label",
    "id": "asset_id",
    "localizedNameKey": "name",
    "previewImage": "preview_image",
    "shotID": "shot_id",
    # Only this one video rendition is carried over as "video_url".
    "url-4K-SDR-240FPS": "video_url",
}
# regex helpers
# Zero-width match at every lowercase-to-uppercase boundary, e.g. between the
# "w" and the "Y" of "NewYork". Case-sensitive (no flags).
_RE_CAMEL = re.compile(r"(?<=[a-z])(?=[A-Z])")
# Fused connector words ("tothe", "to", "and") sitting between two letters.
# Group 1 is the letter before the connector and group 2 the letter after it;
# finalize_label re-emits both around the spaced-out connector.
# NOTE(review): because of re.IGNORECASE, [a-z] and [A-Z] each match letters of
# either case and the connector itself matches in any case, so these patterns
# also fire inside ordinary words ("GrandCanyon" -> "Gr and Canyon",
# "CaribbeanDay" -> "Caribbe and ay"). clean_subcategory_name carries explicit
# overrides for those two results; confirm against real input keys before
# tightening the patterns.
_RE_BREAK_TO_THE = re.compile(r"([a-z])tothe([A-Z])", flags=re.IGNORECASE)
_RE_BREAK_TO = re.compile(r"([a-z])to([A-Z])", flags=re.IGNORECASE)
_RE_BREAK_AND = re.compile(r"([a-z])and([A-Z])", flags=re.IGNORECASE)
def insert_spaces(s: str) -> str:
    """Return *s* with one space added at each lowercase-to-uppercase boundary.

    Only the boundary itself is matched (zero-width), so no characters are
    removed; a string without such a boundary comes back unchanged.
    """
    spaced = _RE_CAMEL.sub(" ", s)
    return spaced
def finalize_label(raw: str) -> str:
    """Turn a fused CamelCase-ish key fragment into a spaced, readable label.

    Steps, in order:
      1. split fused "tothe", then "to", then "and" out of the text;
      2. insert spaces at camel-case boundaries;
      3. lowercase any whole token that is "to", "and" or "the" (in any case),
         leaving every other token exactly as it was.

    Returns the tokens re-joined with single spaces.
    """
    # Applied top to bottom: the longest connector is tried first.
    fused_word_rules = (
        (_RE_BREAK_TO_THE, r"\1 to the \2"),
        (_RE_BREAK_TO, r"\1 to \2"),
        (_RE_BREAK_AND, r"\1 and \2"),
    )
    text = raw
    for pattern, replacement in fused_word_rules:
        text = pattern.sub(replacement, text)

    little_words = ("to", "and", "the")
    words = []
    for word in insert_spaces(text).split():
        lowered = word.lower()
        words.append(lowered if lowered in little_words else word)

    return " ".join(words).strip()
def clean_category_name(raw: str) -> str:
    """Drop a leading ``AerialCategory`` from *raw*.

    Strings that do not start with that prefix are returned unchanged; the
    prefix is only removed from the start, never from the middle.
    """
    prefix = "AerialCategory"
    if not raw.startswith(prefix):
        return raw
    return raw[len(prefix):]
def clean_subcategory_name(raw: str, parent_cat: str) -> str:
    """Build a display name for a subcategory key.

    A leading ``AerialSubcategory`` is removed. When *parent_cat* is exactly
    ``"Cities"``, a following ``Cities`` prefix is removed as well. The
    remainder is passed through ``finalize_label``; two labels that
    ``finalize_label`` is known to split wrongly are then replaced by their
    correct spelling.
    """
    sub_prefix = "AerialSubcategory"
    city_prefix = "Cities"

    trimmed = raw[len(sub_prefix):] if raw.startswith(sub_prefix) else raw
    if parent_cat == city_prefix and trimmed.startswith(city_prefix):
        trimmed = trimmed[len(city_prefix):]

    label = finalize_label(trimmed)

    # Overrides for words that happen to contain a connector ("Grand", "...an D...").
    known_missplits = {
        "Gr and Canyon": "Grand Canyon",
        "Caribbe and ay": "Caribbean Day",
    }
    return known_missplits.get(label, label)
def _build_taxonomy(categories):
    """Return one taxonomy record per category, in input order.

    Each record carries the category's id, cleaned name, representative asset
    id, preview image ("" when absent) and its subcategories sorted by cleaned
    name.
    """
    taxonomy = []
    for cat in categories:
        cat_name = clean_category_name(cat["localizedNameKey"])
        subs = [
            {
                "id": sub["id"],
                "name": clean_subcategory_name(sub["localizedNameKey"], cat_name),
                "representative_asset_id": sub["representativeAssetID"],
                "preview_image": sub.get("previewImage", ""),
            }
            for sub in cat.get("subcategories") or ()
        ]
        subs.sort(key=lambda rec: rec["name"])
        taxonomy.append(
            {
                "id": cat["id"],
                "name": cat_name,
                "representative_asset_id": cat["representativeAssetID"],
                "preview_image": cat.get("previewImage", ""),
                "subcategories": subs,
            }
        )
    return taxonomy


def _group_assets(assets, categories):
    """Return ``{category name: {subcategory name: [asset, ...]}}``.

    An asset is listed under a subcategory when the asset names both the
    owning category (in its "categories") and the subcategory (in its
    "subcategories"). Category ids that are not in *categories* are ignored.
    Subcategory keys are sorted; category keys keep first-seen order.
    """
    # Resolve every category's cleaned names once, instead of once per asset.
    # A later category with a duplicate id replaces an earlier one.
    cat_index = {}
    for cat in categories:
        cat_name = clean_category_name(cat["localizedNameKey"])
        cat_index[cat["id"]] = (
            cat_name,
            [
                (sub["id"], clean_subcategory_name(sub["localizedNameKey"], cat_name))
                for sub in cat.get("subcategories") or ()
            ],
        )

    grouped = defaultdict(lambda: defaultdict(list))
    for asset in assets:
        asset_obj = {new: asset.get(old) for old, new in ASSET_FIELDS.items()}
        # Set membership instead of scanning the asset's list for every
        # subcategory; assumes subcategory ids are hashable (JSON scalars).
        asset_sub_ids = set(asset.get("subcategories") or ())
        for cat_id in asset.get("categories") or ():
            entry = cat_index.get(cat_id)
            if entry is None:
                continue
            cat_name, subs = entry
            for sub_id, sub_name in subs:
                if sub_id in asset_sub_ids:
                    # The same dict is shared by every list it lands in; it is
                    # only serialized afterwards, never mutated.
                    grouped[cat_name][sub_name].append(asset_obj)

    return {
        cat_name: {sub_name: subs[sub_name] for sub_name in sorted(subs)}
        for cat_name, subs in grouped.items()
    }


def main():
    """Read the entries JSON named on the command line and print the result.

    Expects exactly one argument (the input path); otherwise prints a usage
    line to stderr and exits with status 1. Writes a JSON object with the keys
    "assets" and "taxonomy" to stdout, indented by two spaces.

    Errors from opening the file or parsing its JSON are not caught here and
    propagate to the caller.
    """
    if len(sys.argv) != 2:
        print("Usage: python process_entries.py entries.json", file=sys.stderr)
        sys.exit(1)

    # Context manager closes the handle; JSON text is read as UTF-8 rather
    # than whatever the platform's default encoding happens to be.
    with open(sys.argv[1], "r", encoding="utf-8") as fh:
        entries = json.load(fh)

    # "or []" also covers a key that is present but null.
    categories = entries.get("categories") or []
    taxonomy = _build_taxonomy(categories)
    assets_out = _group_assets(entries.get("assets") or [], categories)

    # final payload
    result = {"assets": assets_out, "taxonomy": taxonomy}
    json.dump(result, sys.stdout, indent=2)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment