Skip to content

Instantly share code, notes, and snippets.

@napsternxg
Created July 12, 2024 18:38
Show Gist options
  • Save napsternxg/419ae515e094d4e0e084ffcbea969578 to your computer and use it in GitHub Desktop.
Save napsternxg/419ae515e094d4e0e084ffcbea969578 to your computer and use it in GitHub Desktop.
Parse GPC Dataset https://gpc-browser.gs1.org/
# Download the data from: https://gpc-browser.gs1.org/ using Download GPC as JSON
import json
# Input: the "Download GPC as JSON" export. Output: the flattened records.
data_file = "./data/GPC_May2024.json"
output_file = "./data/GPC_May2024.flattened.json"

# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default locale encoding.
with open(data_file, encoding="utf-8") as fp:
    data = json.load(fp)

# Sanity check: print a one-line summary of every top-level "Schema" entry.
for i, row in enumerate(data["Schema"]):
    # row.keys()=dict_keys(['Level', 'Code', 'Title', 'Definition', 'DefinitionExcludes', 'Active', 'Childs'])
    print(f"{i=}\t{row['Level']=}\t{row['Title']=}\t{len(row['Childs'])=}")
def flatten_data(row, parent_path=None, parent_code=None, current_idx=None):
    """Yield ``row`` and all of its descendants as flat records, depth-first.

    A node whose title is "YES", "NO", "UNIDENTIFIED" or "UNCLASSIFIED", or
    whose title starts with "If ", is dropped together with its whole subtree.

    Args:
        row: Node dict with "Title" and "Code" keys and an optional "Childs"
            list of nested node dicts.
        parent_path: " > "-joined titles of the ancestors; falsy for a root.
        parent_code: "."-joined codes of the ancestors; falsy for a root.
        current_idx: Positional index of ``row``, yielded unchanged. Each
            child receives "<current_idx>.<position>", or just "<position>"
            when ``current_idx`` is None.

    Yields:
        ``(current_path, current_code, current_idx, row_info)`` tuples, where
        ``row_info`` is a shallow copy of the node without its "Childs" key.
    """
    title = row["Title"]
    if title in {"YES", "NO", "UNIDENTIFIED", "UNCLASSIFIED"} or title.startswith(
        "If "
    ):
        return

    current_path = f"{parent_path} > {title}" if parent_path else title
    current_code = f"{parent_code}.{row['Code']}" if parent_code else row["Code"]
    row_info = {k: v for k, v in row.items() if k != "Childs"}
    yield (current_path, current_code, current_idx, row_info)

    # A leaf may omit "Childs" or carry null; both mean "no children".
    for i, child in enumerate(row.get("Childs") or []):
        # Positions count filtered-out siblings too, so an index always
        # matches the child's position in the source data.
        child_idx = str(i) if current_idx is None else f"{current_idx}.{i}"
        yield from flatten_data(child, current_path, current_code, child_idx)
# Flatten every top-level schema entry; its position seeds current_idx.
flattened_data = []
for i, row in enumerate(data["Schema"]):
    flattened_data.extend(flatten_data(row, current_idx=i))
print(f"{len(flattened_data)=}")

# Sanity check for parsed data: print the first matches whose path mentions
# "Milk" (the loop stops once i exceeds 10).
for i, d in enumerate(filter(lambda x: "Milk" in x[0], flattened_data)):
    (current_path, current_code, current_idx, row_info) = d
    print(f"{i=}, {current_idx}, {current_code}, {current_path}\n{row_info=}")
    if i > 10:
        break

# Write one JSON object per line. The file is only written, never read back,
# so plain "w" is enough.
with open(output_file, "w", encoding="utf-8") as fp:
    for (current_path, current_code, current_idx, row_info) in flattened_data:
        row_info = dict(
            row_info,
            current_path=current_path,
            current_code=current_code,
            current_idx=current_idx,
        )
        print(json.dumps(row_info), file=fp)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment