Created
July 29, 2025 20:46
-
-
Save imajes/d62862b132185ac4162ff997025ef42d to your computer and use it in GitHub Desktop.
apple-aerial-asset-parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| process_entries.py | |
| Usage: | |
| python process_entries.py entries.json > final_output.json | |
| Outputs a JSON object with: | |
| - assets: { CategoryName: { SubcategoryName: [ …assets… ] } } | |
| - taxonomy: [ { id, name, representative_asset_id, preview_image, subcategories: [ … ] } ] | |
| """ | |
| import json | |
| import sys | |
| import re | |
| from collections import defaultdict | |
# === which asset fields to pull & how to rename them ===
# Maps a key of an input asset record (left) to the key used for that value in
# the emitted asset object (right). Only these six fields are carried over.
ASSET_FIELDS = {
    "accessibilityLabel": "label",
    "id": "asset_id",
    "localizedNameKey": "name",
    "previewImage": "preview_image",
    "shotID": "shot_id",
    "url-4K-SDR-240FPS": "video_url",
}
# regex helpers
# Zero-width boundary between a lowercase letter and the uppercase letter
# that immediately follows it.
_RE_CAMEL = re.compile(r"(?<=[a-z])(?=[A-Z])")
# Connectives fused between two letters, e.g. "SpaintoFrance"/"SpainToFrance".
# NOTE: re.IGNORECASE also relaxes the [a-z]/[A-Z] classes, so these patterns
# fire inside ordinary words as well ("GrandCanyon" -> "Gr and Canyon").
# Callers that care have to correct such mis-splits themselves.
_RE_BREAK_TO_THE = re.compile(r"([a-z])tothe([A-Z])", flags=re.IGNORECASE)
_RE_BREAK_TO = re.compile(r"([a-z])to([A-Z])", flags=re.IGNORECASE)
_RE_BREAK_AND = re.compile(r"([a-z])and([A-Z])", flags=re.IGNORECASE)
# Words that finalize_label() lowercases when they are not the first word.
_LITTLE_WORDS = ("to", "and", "the")


def insert_spaces(s: str) -> str:
    """Naively put a space at every lowercase-to-uppercase transition."""
    return _RE_CAMEL.sub(" ", s)


def finalize_label(raw: str) -> str:
    """Turn a raw CamelCase-ish key fragment into a spaced display label.

    Fused "to the" / "to" / "and" are broken out first, then spaces are
    inserted at camel-case transitions, and finally the connectives "to",
    "and" and "the" are lowercased. The first word always keeps its original
    casing, so a leading article stays capitalised ("TheAlps" -> "The Alps").

    The connective patterns are case-insensitive and purely textual, so they
    can split real words (e.g. "GrandCanyon" -> "Gr and Canyon").

    Args:
        raw: Name fragment with any category/subcategory prefix already removed.

    Returns:
        The label with single spaces between words; "" for empty input.
    """
    # Order matters: "tothe" must be handled before the shorter "to".
    s = _RE_BREAK_TO_THE.sub(r"\1 to the \2", raw)
    s = _RE_BREAK_TO.sub(r"\1 to \2", s)
    s = _RE_BREAK_AND.sub(r"\1 and \2", s)
    s = insert_spaces(s)

    tokens = s.split()
    if not tokens:
        return ""
    first, rest = tokens[0], tokens[1:]
    # Only non-leading connectives are lowercased; the first word is left as is.
    rest = [t.lower() if t.lower() in _LITTLE_WORDS else t for t in rest]
    return " ".join([first] + rest)
def clean_category_name(raw: str) -> str:
    """Remove the ``AerialCategory`` prefix if present.

    Args:
        raw: A category's name key, e.g. "AerialCategoryCities".

    Returns:
        The text after the prefix, or ``raw`` unchanged when it does not
        start with the prefix.
    """
    prefix = "AerialCategory"
    if raw.startswith(prefix):
        return raw[len(prefix):]
    return raw
def clean_subcategory_name(raw: str, parent_cat: str) -> str:
    """Build the display name of a subcategory from its raw name key.

    Strips the ``AerialSubcategory`` prefix, additionally strips a leading
    ``Cities`` when the parent category is "Cities", and then runs the
    remainder through ``finalize_label`` for spacing and casing.

    Args:
        raw: The subcategory's name key.
        parent_cat: Cleaned name of the category the subcategory belongs to.

    Returns:
        The finalized label, with known mis-splits corrected.
    """
    name = raw
    if name.startswith("AerialSubcategory"):
        name = name[len("AerialSubcategory"):]
    if parent_cat == "Cities" and name.startswith("Cities"):
        name = name[len("Cities"):]

    label = finalize_label(name)

    # finalize_label breaks on any embedded "and", which splits these two
    # names in the middle of a word; map the broken labels back.
    overrides = {
        "Gr and Canyon": "Grand Canyon",
        "Caribbe and ay": "Caribbean Day",
    }
    return overrides.get(label, label)
def _build_taxonomy(categories):
    """Return one taxonomy record per category, keeping the input order."""
    taxonomy = []
    for cat in categories:
        cat_name = clean_category_name(cat["localizedNameKey"])
        # Subcategories are ordered by their cleaned display name.
        subs = sorted(
            (
                {
                    "id": sub["id"],
                    "name": clean_subcategory_name(sub["localizedNameKey"], cat_name),
                    "representative_asset_id": sub["representativeAssetID"],
                    "preview_image": sub.get("previewImage", ""),
                }
                for sub in cat.get("subcategories", [])
            ),
            key=lambda entry: entry["name"],
        )
        taxonomy.append(
            {
                "id": cat["id"],
                "name": cat_name,
                "representative_asset_id": cat["representativeAssetID"],
                "preview_image": cat.get("previewImage", ""),
                "subcategories": subs,
            }
        )
    return taxonomy


def _group_assets(assets, categories):
    """Return {category name: {subcategory name: [asset objects]}}."""
    cats_by_id = {cat["id"]: cat for cat in categories}
    grouped = defaultdict(lambda: defaultdict(list))
    for asset in assets:
        # Missing source fields come out as None (null in the JSON output).
        asset_obj = {new: asset.get(old) for old, new in ASSET_FIELDS.items()}
        # `or []` also covers an explicit JSON null; looked up once per asset.
        asset_sub_ids = asset.get("subcategories") or []
        for cat_id in asset.get("categories") or []:
            cat = cats_by_id.get(cat_id)
            if not cat:
                # The asset references a category that is not in the input.
                continue
            cat_name = clean_category_name(cat["localizedNameKey"])
            # An asset is filed under every subcategory of this category
            # whose id it lists.
            for sub in cat.get("subcategories", []):
                if sub["id"] in asset_sub_ids:
                    sub_name = clean_subcategory_name(sub["localizedNameKey"], cat_name)
                    grouped[cat_name][sub_name].append(asset_obj)
    # Plain dicts; subcategory keys sorted, categories in first-seen order.
    return {
        cat_name: {sub_name: subs[sub_name] for sub_name in sorted(subs)}
        for cat_name, subs in grouped.items()
    }


def main():
    """Read the entries JSON named on the command line and print the result.

    Expects exactly one argument, the path to the input file. Writes a JSON
    object with the keys "assets" and "taxonomy" to stdout (indent of 2).
    Prints a usage line to stderr and exits with status 1 when the argument
    count is wrong.
    """
    if len(sys.argv) != 2:
        print("Usage: python process_entries.py entries.json", file=sys.stderr)
        sys.exit(1)

    # Context manager closes the file; JSON text is read as UTF-8 rather
    # than whatever the locale default happens to be.
    with open(sys.argv[1], "r", encoding="utf-8") as fh:
        entries = json.load(fh)

    categories = entries.get("categories", [])
    result = {
        "assets": _group_assets(entries.get("assets", []), categories),
        "taxonomy": _build_taxonomy(categories),
    }
    json.dump(result, sys.stdout, indent=2)
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment