Convert JSON to JSON-LD. I couldn't find any examples in Python, so I produced one. It is very basic, as I'm still learning JSON-LD, and it does not use bnodes.
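For example (illustrative, not from the gist itself): passing a record such as {"name": "Alice", "age": 30} through json2linked(record, "person", "https://example.org/ns#") yields a document with a basic @context and a single @graph node typed as an owl:NamedIndividual, whose "person.name" and "person.age" values carry xsd datatypes.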
from urllib.parse import quote, unquote
import json
import logging
from uuid import uuid4
from collections import defaultdict
def flatten(l):
    for el in l:
        if type(el) in [list, tuple]:
            yield from flatten(el)
        else:
            yield el
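# e.g. list(flatten([1, [2, [3, 4]], 5])) == [1, 2, 3, 4, 5]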
def json2linked(j, name, ns, unique_id="", properties=False, objects=False, simple_ids=True):
    # Recurse the data to build the json-ld objects
    data = json2linked_r(j, name, unique_id, properties=properties, objects=objects)
    # Add the data to a very basic context
    linked = {
        "@context":
        {
            "@base": ns,
            "@vocab": ns,
            "owl": "http://www.w3.org/2002/07/owl#",
            "xsd": "http://www.w3.org/2001/XMLSchema#",
            "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
            "rdfs": "http://www.w3.org/2000/01/rdf-schema#"
        },
        "@graph": data}
    # We use UUIDs to identify unique objects in the initial pass.
    # Next we replace those with sequence numbers.
    if simple_ids:
        # First we collect a dictionary of all the objects (their position in the list becomes their number)
        obj_map = defaultdict(list)
        for obj in linked['@graph']:
            if "owl:NamedIndividual" in obj["@type"]:
                obj_map[obj["@id"].split("_obj")[0]].append(obj["@id"])
        #print(obj_map)
        # Now we replace the UUIDs with their position in the map list for more legible id numbers
        for j in range(len(linked['@graph'])):
            obj = linked['@graph'][j]
            short_id = obj['@id'].split("_obj")[0]
            if obj['@id'] in obj_map.get(short_id, []):
                obj['@id'] = short_id if len(obj_map[short_id]) <= 1 else short_id + "_" + str(obj_map[short_id].index(obj['@id']) + 1)
            for k in obj.keys():
                if type(obj[k]) == dict and "@id" in obj[k]:
                    short_id = obj[k]['@id'].split("_obj")[0]
                    if obj[k]['@id'] in obj_map.get(short_id, []):
                        obj[k]['@id'] = short_id if len(obj_map[short_id]) <= 1 else short_id + "_" + str(obj_map[short_id].index(obj[k]['@id']) + 1)
                elif type(obj[k]) == list:
                    for i in range(len(obj[k])):
                        if type(obj[k][i]) == dict and "@id" in obj[k][i]:
                            short_id = obj[k][i]['@id'].split("_obj")[0]
                            if obj[k][i]['@id'] in obj_map.get(short_id, []):
                                obj[k][i]['@id'] = short_id if len(obj_map[short_id]) <= 1 else short_id + "_" + str(obj_map[short_id].index(obj[k][i]['@id']) + 1)
            linked['@graph'][j] = obj
    return linked
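# Illustrative example (not in the original gist):
#   json2linked({"name": "Alice", "roles": ["admin", "user"]}, "person", "https://example.org/ns#")
# returns a document whose @graph holds a single node. Because only one "person" object exists,
# simple_ids shortens its "@id" from "person_obj" to just "person", and "person.roles"
# holds two typed @value entries.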
def json2linked_r(d, name, unique_id="", properties=False, objects=False):
    # if 'record', create an object to represent the record pointing at the root object(s)
    type_map = {
        int: "xsd:integer",
        float: "xsd:float",
        str: "xsd:string",
        bool: "xsd:boolean"
    }
    ret = list()
    obj_names = set()
    #print("d: {0}, lbl: {1}, name: {2}, parent: {3}, child: {4}".format(d, lbl, name, parent, child))
    try:
        if type(d) == dict:
            # Create an object for the dict
            if objects:
                obj = {
                    "@id": quote(f"{name}_obj{unique_id}"),
                    "@type": ["owl:NamedIndividual", quote(name)]
                }
            else:
                obj = {
                    "@id": quote(f"{name}_obj{unique_id}"),
                    "@type": ["owl:NamedIndividual", "owl:Thing"]
                }
            for k, v in d.items():
                if properties:
                    obj_prop = {
                        "@id": quote(f"{name}.{k}"),
                        "@type": [],
                        "rdfs:range": []
                    }
                if type(v) in [bool, int, float, str]:
                    if properties:
                        obj_prop['@type'].append("owl:DatatypeProperty")
                        obj_prop['rdfs:range'].append("rdfs:Literal")
                    obj[quote(f"{name}.{k}")] = [{"@value": v, "@type": type_map[type(v)]}]
                elif type(v) in [dict]:
                    if properties:
                        obj_prop['@type'].append("owl:ObjectProperty")
                        obj_prop['rdfs:range'].append("owl:Thing")
                    u = "_" + str(uuid4())[:8]
                    obj[quote(f"{name}.{k}")] = [{"@id": quote(f"{name}.{k}_obj{u}")}]
                    ret += json2linked_r(v, f"{name}.{k}", u, properties=properties, objects=objects)
                elif type(v) in [list, tuple] and len(v) > 0:
                    v = flatten(v)
                    obj[quote(f"{name}.{k}")] = list()
                    for item in v:
                        if type(item) in [bool, int, float, str]:
                            if properties:
                                obj_prop['@type'].append("owl:DatatypeProperty")
                                obj_prop['rdfs:range'].append("rdfs:Literal")
                            obj[quote(f"{name}.{k}")].append({"@value": item, "@type": type_map[type(item)]})
                        elif type(item) in [dict, list, tuple]:
                            if properties:
                                obj_prop['@type'].append("owl:ObjectProperty")
                                obj_prop['rdfs:range'].append("owl:Thing")
                            u = "_" + str(uuid4())[:8]
                            obj[quote(f"{name}.{k}")].append({"@id": quote(f"{name}.{k}_obj{u}")})
                            ret += json2linked_r(item, f"{name}.{k}", u, properties=properties, objects=objects)
                        else:
                            pass  # skipping 'none' items
                if properties:
                    obj_prop['rdfs:range'] = list(set(obj_prop['rdfs:range']))  # make unique
                    obj_prop['@type'] = list(set(obj_prop['@type']))  # make unique
                    if len(obj_prop['@type']) > 1:
                        logging.warning("object property {0} ({1}) has an @type longer than 1. That may cause problems.".format(obj_prop["@id"], obj_prop['@type']))
                    obj_prop['rdfs:range'] = [{"@id": rng} for rng in obj_prop['rdfs:range']]  # wrap in dicts here; doing it earlier breaks the set() de-duplication above
                    ret.append(obj_prop)
                # Add in objects (or add type to property objects)
                if objects:
                    try:
                        i = [j['@id'] for j in ret].index(quote(f"{name}.{k}"))
                        obj_obj = ret[i]
                        obj_obj['@type'].append("owl:Thing")
                        obj_obj['@type'] = list(set(obj_obj['@type']))
                        ret[i] = obj_obj
                    except ValueError:
                        #print(ret, "\n")
                        #raise
                        ret.append({
                            '@id': quote(f"{name}.{k}"),
                            '@type': ["owl:Thing"]
                        })
            ret.append(obj)
        elif type(d) in [bool, int, float, str]:
            logging.warning("Shouldn't get here.")
        elif type(d) in [list, tuple] and len(d) > 0:
            d = flatten(d)
            for item in d:
                if type(item) is dict and len(item) > 0:
                    ret += json2linked_r(item, name, "", properties=properties, objects=objects)
                elif type(item) in [list, tuple]:
                    logging.warning("flatten should prevent you from getting a list of lists. name: {0}, d: {1}".format(name, d))
                elif type(item) in [bool, int, float, str]:
                    logging.warning("I don't think we should get a list of values. name: {0}, d: {1}".format(name, d))
                else:
                    pass  # skip 'none' items
        else:
            pass  # skip 'none'
            #logging.warning("json schema type 'null' not currently supported. d: {0}, name: {1}".format(d, name))
    except:
        print("d: {0}, name: {1}".format(d, name))
        raise
    return ret
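# Illustrative example (not in the original gist):
#   json2linked_r({"a": 1}, "root", "_abcd1234")
# returns a one-element list containing
#   {"@id": "root_obj_abcd1234",
#    "@type": ["owl:NamedIndividual", "owl:Thing"],
#    "root.a": [{"@value": 1, "@type": "xsd:integer"}]}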
# Build the JSON-LD graph
graph = {"@context": {}, "@graph": []}

# records_dict is assumed to be defined elsewhere as a mapping of record name -> raw JSON string.
# We query the records separately so that the property paths are the same for each record.
for name, record in records_dict.items():
    u = "_" + str(uuid4())[:8]
    j = json.loads(record)
    j['record'] = name
    # Set a unique_id so root records are unique.
    # We set objects=True so we can identify the root by its object type.
    # We set simple_ids=False to prevent ID collisions between the separately queried records.
    # We _could_ set properties=True so that we could query paths of subProperties of the top properties.
    jld = json2linked(j, "root", "https://example.org/ns#", unique_id=u, objects=True, simple_ids=False)
    graph['@context'] = jld['@context']
    graph['@graph'] += jld['@graph']
    del j
    del jld

# Because we joined records with the same structure, objects and properties will be duplicated.
# Here we delete the duplicates.
objs = set()
graph2 = list()
for obj in graph['@graph']:
    if obj['@id'] not in objs:
        objs.add(obj['@id'])
        graph2.append(obj)
graph['@graph'] = graph2
del graph2
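# Minimal sketch (not in the original gist): the assembled graph is plain JSON, so it can be
# written straight to disk as a .jsonld file; the output filename here is arbitrary.
with open("records.jsonld", "w") as f:
    json.dump(graph, f, indent=2)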