napsternxg · July 12, 2024 18:38
diff --git a/parse_gpc_data.py b/parse_gpc_data.py
 # Download the data from: https://gpc-browser.gs1.org/ using Download GPC as JSON

 import json

 data_file = "./data/GPC_May2024.json"
 output_file = "./data/GPC_May2024.flattened.json"

 # Decode explicitly as UTF-8 instead of relying on the platform default
 # encoding, which may fail on non-ASCII text in the export.
 # NOTE(review): assumes the downloaded file is UTF-8 encoded - confirm.
 with open(data_file, encoding="utf-8") as fp:
     data = json.load(fp)

 # Sanity check: print one summary line per top-level entry of data["Schema"].
 for i, row in enumerate(data["Schema"]):
     # row.keys()=dict_keys(['Level', 'Code', 'Title', 'Definition', 'DefinitionExcludes', 'Active', 'Childs'])
     print(f"{i=}\t{row['Level']=}\t{row['Title']=}\t{len(row['Childs'])=}")


 def flatten_data(row, parent_path=None, parent_code=None, current_idx=None):
     """Yield one record for ``row`` and each of its descendants, depth first.

     A row whose title is "YES", "NO", "UNIDENTIFIED" or "UNCLASSIFIED", or
     whose title starts with "If ", is skipped together with its subtree.

     Args:
         row: Node dict; the "Title", "Code" and "Childs" keys are read.
         parent_path: " > "-joined titles of the ancestors, or None at the root.
         parent_code: "."-joined codes of the ancestors, or None at the root.
         current_idx: Index of ``row`` as supplied by the caller. It is
             yielded unchanged; children receive "<current_idx>.<position>",
             or just "<position>" when no index was supplied.

     Yields:
         ``(current_path, current_code, current_idx, row_info)`` tuples, where
         ``row_info`` is a shallow copy of ``row`` without its "Childs" key.
     """
     title = row["Title"]
     skipped_titles = {"YES", "NO", "UNIDENTIFIED", "UNCLASSIFIED"}
     if title in skipped_titles or title.startswith("If "):
         return
     current_path = f"{parent_path} > {title}" if parent_path else title
     current_code = f"{parent_code}.{row['Code']}" if parent_code else row["Code"]
     row_info = {k: v for k, v in row.items() if k != "Childs"}
     yield (current_path, current_code, current_idx, row_info)
     # Positions count every child, including the ones that end up skipped.
     for i, child in enumerate(row["Childs"]):
         # With no index of our own, number the children from scratch rather
         # than emitting a misleading "None.<i>".
         child_idx = str(i) if current_idx is None else f"{current_idx}.{i}"
         yield from flatten_data(child, current_path, current_code, child_idx)


 # Flatten every top-level schema entry; its position seeds the index string.
 flattened_data = []
 for i, row in enumerate(data["Schema"]):
     flattened_data.extend(flatten_data(row, current_idx=i))
 print(f"{len(flattened_data)=}")

 # Sanity check for parsed data: print the leading records whose path
 # contains "Milk" (the loop stops after the record with i == 11).
 milk_records = (d for d in flattened_data if "Milk" in d[0])
 for i, (current_path, current_code, current_idx, row_info) in enumerate(milk_records):
     print(f"{i=}, {current_idx}, {current_code}, {current_path}\n{row_info=}")
     if i > 10:
         break

 # Write one JSON object per line. The file is only written, so plain "w" is
 # enough; json.dumps escapes non-ASCII by default, so the bytes are unchanged.
 with open(output_file, "w", encoding="utf-8") as fp:
     for current_path, current_code, current_idx, row_info in flattened_data:
         record = dict(
             row_info,
             current_path=current_path,
             current_code=current_code,
             current_idx=current_idx,
         )
         print(json.dumps(record), file=fp)
	# Download the data from: https://gpc-browser.gs1.org/ using Download GPC as JSON

	import json

	data_file = "./data/GPC_May2024.json"
	output_file = "./data/GPC_May2024.flattened.json"

	# Decode explicitly as UTF-8 instead of relying on the platform default
	# encoding, which may fail on non-ASCII text in the export.
	# NOTE(review): assumes the downloaded file is UTF-8 encoded - confirm.
	with open(data_file, encoding="utf-8") as fp:
	    data = json.load(fp)

	# Sanity check: print one summary line per top-level entry of data["Schema"].
	for i, row in enumerate(data["Schema"]):
	    # row.keys()=dict_keys(['Level', 'Code', 'Title', 'Definition', 'DefinitionExcludes', 'Active', 'Childs'])
	    print(f"{i=}\t{row['Level']=}\t{row['Title']=}\t{len(row['Childs'])=}")


	def flatten_data(row, parent_path=None, parent_code=None, current_idx=None):
	    """Yield one record for ``row`` and each of its descendants, depth first.

	    A row whose title is "YES", "NO", "UNIDENTIFIED" or "UNCLASSIFIED", or
	    whose title starts with "If ", is skipped together with its subtree.

	    Args:
	        row: Node dict; the "Title", "Code" and "Childs" keys are read.
	        parent_path: " > "-joined titles of the ancestors, or None at the root.
	        parent_code: "."-joined codes of the ancestors, or None at the root.
	        current_idx: Index of ``row`` as supplied by the caller. It is
	            yielded unchanged; children receive "<current_idx>.<position>",
	            or just "<position>" when no index was supplied.

	    Yields:
	        ``(current_path, current_code, current_idx, row_info)`` tuples, where
	        ``row_info`` is a shallow copy of ``row`` without its "Childs" key.
	    """
	    title = row["Title"]
	    skipped_titles = {"YES", "NO", "UNIDENTIFIED", "UNCLASSIFIED"}
	    if title in skipped_titles or title.startswith("If "):
	        return
	    current_path = f"{parent_path} > {title}" if parent_path else title
	    current_code = f"{parent_code}.{row['Code']}" if parent_code else row["Code"]
	    row_info = {k: v for k, v in row.items() if k != "Childs"}
	    yield (current_path, current_code, current_idx, row_info)
	    # Positions count every child, including the ones that end up skipped.
	    for i, child in enumerate(row["Childs"]):
	        # With no index of our own, number the children from scratch rather
	        # than emitting a misleading "None.<i>".
	        child_idx = str(i) if current_idx is None else f"{current_idx}.{i}"
	        yield from flatten_data(child, current_path, current_code, child_idx)


	# Flatten every top-level schema entry; its position seeds the index string.
	flattened_data = []
	for i, row in enumerate(data["Schema"]):
	    flattened_data.extend(flatten_data(row, current_idx=i))
	print(f"{len(flattened_data)=}")

	# Sanity check for parsed data: print the leading records whose path
	# contains "Milk" (the loop stops after the record with i == 11).
	milk_records = (d for d in flattened_data if "Milk" in d[0])
	for i, (current_path, current_code, current_idx, row_info) in enumerate(milk_records):
	    print(f"{i=}, {current_idx}, {current_code}, {current_path}\n{row_info=}")
	    if i > 10:
	        break

	# Write one JSON object per line. The file is only written, so plain "w" is
	# enough; json.dumps escapes non-ASCII by default, so the bytes are unchanged.
	with open(output_file, "w", encoding="utf-8") as fp:
	    for current_path, current_code, current_idx, row_info in flattened_data:
	        record = dict(
	            row_info,
	            current_path=current_path,
	            current_code=current_code,
	            current_idx=current_idx,
	        )
	        print(json.dumps(record), file=fp)