imajes · July 29, 2025 20:46
diff --git a/process_entries.py b/process_entries.py
 #!/usr/bin/env python3
 """
 process_entries.py

 Usage:
  python process_entries.py entries.json > final_output.json

 Outputs a JSON object with:
  - assets: { CategoryName: { SubcategoryName: [ …assets… ] } }
  - taxonomy: [ { id, name, representative_asset_id, preview_image, subcategories: [ … ] } ]
 """

 import json
 import sys
 import re
 from collections import defaultdict

 # Input asset key -> output asset key. main() copies exactly these fields
 # into every emitted asset object; a key missing from the input yields null.
 ASSET_FIELDS = {
     "accessibilityLabel": "label",
     "id": "asset_id",
     "localizedNameKey": "name",
     "previewImage": "preview_image",
     "shotID": "shot_id",
     "url-4K-SDR-240FPS": "video_url",
 }

 # Zero-width boundary between a lowercase letter and the uppercase letter
 # that follows it (the camel-case split point).
 _RE_CAMEL = re.compile(r"(?<=[a-z])(?=[A-Z])")
 # Fused "tothe" / "to" / "and" sitting between two letters.
 # NOTE: with IGNORECASE both character classes match letters of either case,
 # so ordinary words are split too ("GrandCanyon" -> "Gr and Canyon");
 # clean_subcategory_name() patches the cases known so far.
 _RE_BREAK_TO_THE = re.compile(r"([a-z])tothe([A-Z])", re.IGNORECASE)
 _RE_BREAK_TO = re.compile(r"([a-z])to([A-Z])", re.IGNORECASE)
 _RE_BREAK_AND = re.compile(r"([a-z])and([A-Z])", re.IGNORECASE)


 def insert_spaces(s: str) -> str:
     """Return *s* with a space added at each lowercase-to-uppercase boundary."""
     spaced = _RE_CAMEL.sub(" ", s)
     return spaced


 def finalize_label(raw: str) -> str:
     """
     Turn a fused CamelCase-ish name into a spaced, human-readable label.

     Fused "tothe", "to" and "and" are separated first (in that order, so the
     longer "tothe" is handled before the bare "to"), camel-case boundaries
     are then spaced out, and finally the words "to", "and" and "the" are
     forced to lowercase while every other word keeps its casing.
     """
     little_words = ("to", "and", "the")
     # (pattern, replacement) pairs, applied strictly in this order.
     breakers = (
         (_RE_BREAK_TO_THE, r"\1 to the \2"),
         (_RE_BREAK_TO, r"\1 to \2"),
         (_RE_BREAK_AND, r"\1 and \2"),
     )

     text = raw
     for pattern, replacement in breakers:
         text = pattern.sub(replacement, text)

     words = []
     for word in insert_spaces(text).split():
         lowered = word.lower()
         words.append(lowered if lowered in little_words else word)
     return " ".join(words).strip()


 def clean_category_name(raw: str) -> str:
     """Return *raw* without a leading "AerialCategory"; other names pass through."""
     prefix = "AerialCategory"
     if not raw.startswith(prefix):
         return raw
     return raw[len(prefix):]


 def clean_subcategory_name(raw: str, parent_cat: str) -> str:
     """
     Build the display name of a subcategory.

     A leading "AerialSubcategory" is dropped; when the parent category is
     "Cities" a further leading "Cities" is dropped as well. The remainder is
     passed through finalize_label(), and the labels it is known to split
     wrongly are mapped back to their proper spelling.
     """
     # Labels that finalize_label() mis-splits, keyed by the broken output.
     known_missplits = {
         "Gr and Canyon": "Grand Canyon",
         "Caribbe and ay": "Caribbean Day",
     }

     # Prefixes are stripped one after another, in this order.
     prefixes = ["AerialSubcategory"]
     if parent_cat == "Cities":
         prefixes.append("Cities")

     trimmed = raw
     for prefix in prefixes:
         if trimmed.startswith(prefix):
             trimmed = trimmed[len(prefix):]

     label = finalize_label(trimmed)
     return known_missplits.get(label, label)


 def main():
     """
     Command-line entry point: convert an entries JSON file to the final payload.

     Reads the file named by the single command-line argument and writes to
     stdout a JSON object with two keys:

     - "assets": category name -> subcategory name -> list of asset objects
       (fields selected and renamed through ASSET_FIELDS), subcategory keys
       sorted by name;
     - "taxonomy": one record per category, each with its subcategories
       sorted by name.

     Exits with status 1 and a usage message on stderr when the number of
     command-line arguments is wrong.
     """
     if len(sys.argv) != 2:
         print("Usage: python process_entries.py entries.json", file=sys.stderr)
         sys.exit(1)

     # Binary mode inside a context manager: the handle is closed
     # deterministically, and json detects the UTF-8/16/32 encoding itself
     # instead of depending on the platform's locale encoding.
     with open(sys.argv[1], "rb") as fh:
         entries = json.load(fh)

     categories = entries.get("categories", [])

     # map category ID -> full record
     cats_by_id = {c["id"]: c for c in categories}

     # 1) Build taxonomy
     taxonomy = []
     for c in categories:
         cat_name = clean_category_name(c["localizedNameKey"])
         # gather and sort subcats
         subs = [
             {
                 "id": sub["id"],
                 "name": clean_subcategory_name(sub["localizedNameKey"], cat_name),
                 "representative_asset_id": sub["representativeAssetID"],
                 "preview_image": sub.get("previewImage", ""),
             }
             for sub in c.get("subcategories", [])
         ]
         subs.sort(key=lambda x: x["name"])
         taxonomy.append(
             {
                 "id": c["id"],
                 "name": cat_name,
                 "representative_asset_id": c["representativeAssetID"],
                 "preview_image": c.get("previewImage", ""),
                 "subcategories": subs,
             }
         )

     # 2) Group assets
     grouped = defaultdict(lambda: defaultdict(list))
     for a in entries.get("assets", []):
         asset_obj = {new: a.get(old) for old, new in ASSET_FIELDS.items()}
         # Looked up once per asset rather than once per candidate subcategory.
         asset_sub_ids = a.get("subcategories", [])

         for cat_id in a.get("categories", []):
             c = cats_by_id.get(cat_id)
             if not c:
                 # The asset references a category that is not in the file.
                 continue
             cat_name = clean_category_name(c["localizedNameKey"])
             # find subcats this asset belongs to
             for sub in c.get("subcategories", []):
                 if sub["id"] in asset_sub_ids:
                     sub_name = clean_subcategory_name(sub["localizedNameKey"], cat_name)
                     grouped[cat_name][sub_name].append(asset_obj)

     # convert to regular dicts with sorted subcategory keys
     assets_out = {
         cat: {sub_name: subs[sub_name] for sub_name in sorted(subs)}
         for cat, subs in grouped.items()
     }

     # final payload
     result = {"assets": assets_out, "taxonomy": taxonomy}

     json.dump(result, sys.stdout, indent=2)

 # Run the conversion only when this file is executed as a script, so that
 # importing the module does not read sys.argv or write to stdout.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	"""
	process_entries.py

	Usage:
	python process_entries.py entries.json > final_output.json

	Outputs a JSON object with:
	- assets: { CategoryName: { SubcategoryName: [ …assets… ] } }
	- taxonomy: [ { id, name, representative_asset_id, preview_image, subcategories: [ … ] } ]
	"""

	import json
	import sys
	import re
	from collections import defaultdict

	# Input asset key -> output asset key. main() copies exactly these fields
	# into every emitted asset object; a key missing from the input yields null.
	ASSET_FIELDS = {
	    "accessibilityLabel": "label",
	    "id": "asset_id",
	    "localizedNameKey": "name",
	    "previewImage": "preview_image",
	    "shotID": "shot_id",
	    "url-4K-SDR-240FPS": "video_url",
	}

	# Zero-width boundary between a lowercase letter and the uppercase letter
	# that follows it (the camel-case split point).
	_RE_CAMEL = re.compile(r"(?<=[a-z])(?=[A-Z])")
	# Fused "tothe" / "to" / "and" sitting between two letters.
	# NOTE: with IGNORECASE both character classes match letters of either case,
	# so ordinary words are split too ("GrandCanyon" -> "Gr and Canyon");
	# clean_subcategory_name() patches the cases known so far.
	_RE_BREAK_TO_THE = re.compile(r"([a-z])tothe([A-Z])", re.IGNORECASE)
	_RE_BREAK_TO = re.compile(r"([a-z])to([A-Z])", re.IGNORECASE)
	_RE_BREAK_AND = re.compile(r"([a-z])and([A-Z])", re.IGNORECASE)


	def insert_spaces(s: str) -> str:
	    """Naively put a space at every lowercase-to-uppercase (camel-case) boundary."""
	    return _RE_CAMEL.sub(" ", s)


	def finalize_label(raw: str) -> str:
	    """
	    From a raw CamelCase-ish string, insert spaces, then lowercase the
	    common conjunctions/articles ("to", "and", "the").

	    The fused-word patterns run before the camel-case split, and "tothe"
	    runs before the bare "to" so the longer form is handled first.
	    """
	    # 1) break up fused "to the"
	    s = _RE_BREAK_TO_THE.sub(r"\1 to the \2", raw)
	    # 2) break up fused "to"
	    s = _RE_BREAK_TO.sub(r"\1 to \2", s)
	    # 3) break up fused "and"
	    s = _RE_BREAK_AND.sub(r"\1 and \2", s)
	    # 4) camel -> spaces
	    s = insert_spaces(s)
	    # 5) lowercase the little words, leave every other token untouched
	    tokens = s.split()
	    tokens = [t.lower() if t.lower() in ("to", "and", "the") else t for t in tokens]
	    return " ".join(tokens).strip()


	def clean_category_name(raw: str) -> str:
	    """Remove the "AerialCategory" prefix if present; return other names unchanged."""
	    prefix = "AerialCategory"
	    return raw[len(prefix):] if raw.startswith(prefix) else raw


	def clean_subcategory_name(raw: str, parent_cat: str) -> str:
	    """
	    Strip the "AerialSubcategory" prefix and, when the parent category is
	    "Cities", a following "Cities" prefix; then finalize spacing and casing.

	    Two labels that finalize_label() is known to split wrongly are mapped
	    back to their proper spelling before returning.
	    """
	    name = raw
	    if name.startswith("AerialSubcategory"):
	        name = name[len("AerialSubcategory"):]
	    if parent_cat == "Cities" and name.startswith("Cities"):
	        name = name[len("Cities"):]
	    new_label = finalize_label(name)
	    # overrides for the mis-split cases ("and" found inside a longer word)
	    if new_label == "Gr and Canyon":
	        return "Grand Canyon"
	    if new_label == "Caribbe and ay":
	        return "Caribbean Day"
	    return new_label


	def main():
	    """
	    Command-line entry point: convert an entries JSON file to the final payload.

	    Reads the file named by the single command-line argument and writes to
	    stdout a JSON object with two keys:

	    - "assets": category name -> subcategory name -> list of asset objects
	      (fields selected and renamed through ASSET_FIELDS), subcategory keys
	      sorted by name;
	    - "taxonomy": one record per category, each with its subcategories
	      sorted by name.

	    Exits with status 1 and a usage message on stderr when the number of
	    command-line arguments is wrong.
	    """
	    if len(sys.argv) != 2:
	        print("Usage: python process_entries.py entries.json", file=sys.stderr)
	        sys.exit(1)

	    # Binary mode inside a context manager: the handle is closed
	    # deterministically, and json detects the UTF-8/16/32 encoding itself
	    # instead of depending on the platform's locale encoding.
	    with open(sys.argv[1], "rb") as fh:
	        entries = json.load(fh)

	    categories = entries.get("categories", [])

	    # map category ID -> full record
	    cats_by_id = {c["id"]: c for c in categories}

	    # 1) Build taxonomy
	    taxonomy = []
	    for c in categories:
	        cat_name = clean_category_name(c["localizedNameKey"])
	        # gather and sort subcats
	        subs = []
	        for sub in c.get("subcategories", []):
	            subs.append(
	                {
	                    "id": sub["id"],
	                    "name": clean_subcategory_name(sub["localizedNameKey"], cat_name),
	                    "representative_asset_id": sub["representativeAssetID"],
	                    "preview_image": sub.get("previewImage", ""),
	                }
	            )
	        subs.sort(key=lambda x: x["name"])
	        taxonomy.append(
	            {
	                "id": c["id"],
	                "name": cat_name,
	                "representative_asset_id": c["representativeAssetID"],
	                "preview_image": c.get("previewImage", ""),
	                "subcategories": subs,
	            }
	        )

	    # 2) Group assets
	    grouped = defaultdict(lambda: defaultdict(list))
	    for a in entries.get("assets", []):
	        asset_obj = {new: a.get(old) for old, new in ASSET_FIELDS.items()}
	        # Looked up once per asset rather than once per candidate subcategory.
	        asset_sub_ids = a.get("subcategories", [])

	        for cat_id in a.get("categories", []):
	            c = cats_by_id.get(cat_id)
	            if not c:
	                # The asset references a category that is not in the file.
	                continue
	            cat_name = clean_category_name(c["localizedNameKey"])
	            # find subcats this asset belongs to
	            for sub in c.get("subcategories", []):
	                if sub["id"] in asset_sub_ids:
	                    sub_name = clean_subcategory_name(sub["localizedNameKey"], cat_name)
	                    grouped[cat_name][sub_name].append(asset_obj)

	    # convert to regular dicts with sorted subcategory keys
	    assets_out = {}
	    for cat, subs in grouped.items():
	        assets_out[cat] = {sub_name: subs[sub_name] for sub_name in sorted(subs)}

	    # final payload
	    result = {"assets": assets_out, "taxonomy": taxonomy}

	    json.dump(result, sys.stdout, indent=2)

	# Run the conversion only when this file is executed as a script, so that
	# importing the module does not read sys.argv or write to stdout.
	if __name__ == "__main__":
	    main()