Created
July 12, 2024 18:38
-
-
Save napsternxg/419ae515e094d4e0e084ffcbea969578 to your computer and use it in GitHub Desktop.
Parse GPC Dataset https://gpc-browser.gs1.org/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download the data from https://gpc-browser.gs1.org/ using "Download GPC as JSON".
import json

data_file = "./data/GPC_May2024.json"
output_file = "./data/GPC_May2024.flattened.json"

# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, which is not UTF-8 everywhere (e.g. cp1252 on many Windows setups).
with open(data_file, encoding="utf-8") as fp:
    data = json.load(fp)

# Sanity check: print one summary line per top-level entry of the schema.
for i, row in enumerate(data["Schema"]):
    # row.keys()=dict_keys(['Level', 'Code', 'Title', 'Definition', 'DefinitionExcludes', 'Active', 'Childs'])
    print(f"{i=}\t{row['Level']=}\t{row['Title']=}\t{len(row['Childs'])=}")
def flatten_data(row, parent_path=None, parent_code=None, current_idx=None):
    """Yield one record per node of a schema subtree, in depth-first order.

    Args:
        row: A node dict with at least "Title" and "Code" keys. Its children,
            if any, are listed under "Childs"; a missing or ``None`` "Childs"
            entry is treated as "no children".
        parent_path: " > "-joined titles of the ancestors, or ``None`` for a
            root node.
        parent_code: "."-joined codes of the ancestors, or ``None`` for a
            root node.
        current_idx: Positional index of this node. It is yielded unchanged
            for ``row``; each child gets ``"<current_idx>.<position>"``, or
            just ``"<position>"`` when ``current_idx`` is ``None``.

    Yields:
        Tuples ``(current_path, current_code, current_idx, row_info)`` where
        ``row_info`` is a copy of the node without its "Childs" key.

    A node titled "YES", "NO", "UNIDENTIFIED" or "UNCLASSIFIED", or whose
    title starts with "If ", is skipped together with its whole subtree.
    NOTE(review): these look like placeholder/attribute-value entries rather
    than real categories -- confirm against the GPC data.
    """
    title = row["Title"]
    if title in {"YES", "NO", "UNIDENTIFIED", "UNCLASSIFIED"} or title.startswith("If "):
        return

    current_path = f"{parent_path} > {title}" if parent_path else title
    current_code = f"{parent_code}.{row['Code']}" if parent_code else row["Code"]
    row_info = {k: v for k, v in row.items() if k != "Childs"}
    yield (current_path, current_code, current_idx, row_info)

    # Leaf nodes may lack a child list entirely; iterate over nothing then.
    for i, child in enumerate(row.get("Childs") or ()):
        # Without a parent index there is nothing to prefix; avoid "None.<i>".
        child_idx = str(i) if current_idx is None else f"{current_idx}.{i}"
        yield from flatten_data(child, current_path, current_code, child_idx)
# Flatten every top-level schema entry, tagging each with its position in
# data["Schema"] so nested records carry a dotted positional index.
flattened_data = [
    entry
    for top_idx, top_row in enumerate(data["Schema"])
    for entry in flatten_data(top_row, current_idx=top_idx)
]
print(f"{len(flattened_data)=}")

# Sanity check for parsed data: show the first twelve records whose path
# mentions "Milk".
milk_records = (entry for entry in flattened_data if "Milk" in entry[0])
for i, (current_path, current_code, current_idx, row_info) in enumerate(milk_records):
    print(f"{i=}, {current_idx}, {current_code}, {current_path}\n{row_info=}")
    if i >= 11:
        break

# Write one JSON object per line: the node's own fields plus its computed
# path, code chain and positional index.
with open(output_file, "w+") as fp:
    for path, code, idx, info in flattened_data:
        record = {**info, "current_path": path, "current_code": code, "current_idx": idx}
        fp.write(json.dumps(record) + "\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment