PheKnowLator - repairing metadata files
# Script Purpose: This script was built to address https://github.com/callahantiff/PheKnowLator/issues/116
# import needed libraries
import json
import os
import re
import pandas as pd
import pickle
import shutil

from datetime import datetime
from google.cloud import storage  # type: ignore
from rdflib import Graph, Literal, Namespace, URIRef  # type: ignore
from rdflib.namespace import RDF, RDFS, OWL  # type: ignore
from tqdm import tqdm

from builds.build_utilities import *  # type: ignore
from pkt_kg.__version__ import __version__  # type: ignore
from pkt_kg.utils import *  # type: ignore
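# The repair proceeds in three steps for each archived build:
#   1. locate the NodeLabels.txt, node_metadata_dict.pkl, and Triples_Integer_Identifier_Map.json files
#   2. clean the pickled metadata dictionary (collapse newlines and repeated whitespace in each field)
#   3. regenerate NodeLabels.txt from the integer identifier map plus the cleaned metadata and re-upload both to GCS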
log_dir = 'temp/'

# find the most recent build of the current pkt_kg release in the 'pheknowlator' GCS bucket
release = 'release_v' + __version__; bucket = storage.Client().get_bucket('pheknowlator')
bucket_files = [file.name.split('/')[2] for file in bucket.list_blobs(prefix='archived_builds/' + release + '/')]
builds = [x[0] for x in [re.findall(r'(?<=_)\d.*', x) for x in bucket_files] if len(x) > 0]
sorted_dates = sorted([datetime.strftime(datetime.strptime(str(x), '%d%b%Y'), '%Y-%m-%d').upper() for x in builds])
build = 'build_' + datetime.strftime(datetime.strptime(sorted_dates[-1], '%Y-%m-%d'), '%d%b%Y').upper()
# arch_builds = [file.name for file in bucket.list_blobs(prefix='archived_builds/release_v2.0.0/build_11FEB2021/')]
# y = sorted([x for x in arch_builds if x.endswith('NodeLabels.txt') or x.endswith('node_metadata_dict.pkl') or
#             x.endswith('Triples_Integer_Identifier_Map.json')])
# for i in y: print("'" + i + "'")
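# NOTE: `directory_list` (used below) is assumed to be built elsewhere. A minimal sketch of how it could be
# assembled, following the listing pattern commented out above (one set of the three target files per build
# directory); the grouping logic here is an assumption, not part of the original script:
# suffixes = ('NodeLabels.txt', 'node_metadata_dict.pkl', 'Triples_Integer_Identifier_Map.json')
# grouped = {}
# for blob in bucket.list_blobs(prefix='archived_builds/' + release + '/' + build + '/'):
#     if blob.name.endswith(suffixes):
#         grouped.setdefault('/'.join(blob.name.split('/')[:-1]), []).append(blob.name)
# directory_list = [sorted(v) for v in grouped.values()]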
directories = directory_list  # list of file path sets, one per build directory (see sketch above)
for i in tqdm(range(0, len(directories))):
    result_set = directories[i]; meta_dict, node_label, node_int_map = None, None, None
    print('\n\n*** PROCESSING SET: {} ***'.format('/'.join(result_set[0].split('/')[0:7])))
    for f in result_set:
        if f.endswith('NodeLabels.txt'):
            node_label_org = '/'.join(f.split('/')[:-1]); node_label = f.split('/')[-1]
        if f.endswith('node_metadata_dict.pkl'):
            node_meta_org = '/'.join(f.split('/')[:-1]); node_meta = f.split('/')[-1]
            bucket.blob(f).download_to_filename(log_dir + node_meta)
            meta_dict = pickle.load(open(log_dir + node_meta, 'rb'))
        if f.endswith('Triples_Integer_Identifier_Map.json'):
            node_int_org = '/'.join(f.split('/')[:-1]); node_int = f.split('/')[-1]
            bucket.blob(f).download_to_filename(log_dir + node_int)
            node_int_map = json.load(open(log_dir + node_int, 'r'))
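    # meta_dict is expected to contain 'nodes' and 'relations' keys, each mapping an entity URI to a
    # dict with 'Label', 'Description', and 'Synonym' fields (any of which may be None)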
    if meta_dict is not None and node_label is not None and node_int_map is not None:
        # fix the metadata dictionary: collapse newlines and runs of whitespace in every metadata field
        # (e.g. 'a label\nwith  breaks' becomes 'a label with breaks')
        if len(meta_dict['nodes']) > 0 and len(meta_dict['relations']) > 0:
            print('Updating node_metadata_dict')
            temp_copy = meta_dict.copy(); meta_dict = dict()
            for key, value in temp_copy.items():
                meta_dict[key] = {}
                for e_key, val in value.items():
                    d = {k: re.sub(r'\s\s+', ' ', v.replace('\n', ' ')) if v is not None else v for k, v in val.items()}
                    meta_dict[key][e_key] = d
            del temp_copy; pickle.dump(meta_dict, open(log_dir + node_meta, 'wb'), protocol=4)
            uploads_data_to_gcs_bucket(bucket, node_meta_org + '/', log_dir, node_meta)
        else: raise ValueError('node_metadata_dict is empty!')
        # regenerate the NodeLabels.txt node list from the integer identifier map
        if len(node_int_map) > 0:
            print('Generating Updated NodeLabels.txt')
            # map keys are either n3-formatted URIs (i.e. '<http://...>') or plain strings
            entity_type = 'n3' if list(node_int_map.keys())[0].startswith('<') else 'string'
            if entity_type != 'n3': entities = set(k for k in node_int_map.keys() if k.startswith('http'))
            else: entities = set(URIRef(k.strip('<').strip('>')) for k in node_int_map.keys() if k.startswith('<http'))
            with open(log_dir + node_label, 'w', encoding='utf-8') as out:
                out.write('entity_type' + '\t' +
                          'integer_id' + '\t' +
                          'entity_uri' + '\t' +
                          'label' + '\t' +
                          'description/definition' + '\t' +
                          'synonym' + '\n')
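                # write one row per entity; label/description/synonym fall back to 'NA' for entities
                # absent from meta_dict and to 'None' for individually missing fields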
                for x in entities:
                    if entity_type != 'n3': nid, nint = str(x), node_int_map[str(x)]
                    else: nid, nint = x.n3(), node_int_map[x.n3()]  # URIRef.n3() restores the '<...>' form
                    if str(x) in meta_dict['nodes'].keys(): etyp, meta = 'NODES', meta_dict['nodes'][str(x)]
                    elif str(x) in meta_dict['relations'].keys(): etyp, meta = 'RELATIONS', meta_dict['relations'][str(x)]
                    else: meta, etyp, lab, dsc, syn = None, 'NA', 'NA', 'NA', 'NA'
                    if meta is not None:
                        lab = meta['Label'] if meta['Label'] is not None else 'None'
                        dsc = meta['Description'] if meta['Description'] is not None else 'None'
                        syn = meta['Synonym'] if meta['Synonym'] is not None else 'None'
                    try: out.write(etyp + '\t' + str(nint) + '\t' + nid + '\t' + lab + '\t' + dsc + '\t' + syn + '\n')
                    except UnicodeEncodeError:  # scrub characters (e.g. lone surrogates) that UTF-8 cannot encode
                        out.write('\t'.join(s.encode('utf-8', errors='replace').decode('utf-8')
                                            for s in (etyp, str(nint), nid, lab, dsc, syn)) + '\n')
            # upload cleaned node list to gcs bucket
            header_key = list(meta_dict['nodes'].keys())[0]; headings = list(meta_dict['nodes'][header_key].keys())
            df = pd.read_csv(log_dir + node_label, header=0, sep='\t'); df_columns = list(df.columns)
            if len(headings) + 3 == len(df_columns) and len(df) > 0:  # 3 metadata fields + entity_type, integer_id, entity_uri
                uploads_data_to_gcs_bucket(bucket, node_label_org + '/', log_dir, node_label)
                shutil.rmtree(log_dir); os.mkdir(log_dir)  # reset the temp directory for the next set
            else: print(df_columns); raise ValueError('DF has the incorrect number of columns')
        else: raise ValueError('Triples_Integer_Identifier_Map.json is empty!')
    else: raise ValueError('One of the files did not download correctly!')
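# Optional spot check (a minimal sketch, not part of the original repair): re-download one repaired
# NodeLabels.txt and confirm it parses with the expected six columns; the path below reuses the last
# processed set and is illustrative only.
# bucket.blob(node_label_org + '/' + node_label).download_to_filename('check_NodeLabels.txt')
# check_df = pd.read_csv('check_NodeLabels.txt', sep='\t', header=0)
# assert list(check_df.columns) == ['entity_type', 'integer_id', 'entity_uri', 'label',
#                                   'description/definition', 'synonym']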
This Gist is part of the solution for issue #116 in the PheKnowLator repository.