PheKnowLator - repairing metadata files
# Script Purpose: This script was built to address https://github.com/callahantiff/PheKnowLator/issues/116
# import needed libraries
import json
import os
import re
import pandas as pd
import pickle
import shutil

from datetime import datetime
from google.cloud import storage  # type: ignore
from rdflib import Graph, Literal, Namespace, URIRef  # type: ignore
from rdflib.namespace import RDF, RDFS, OWL  # type: ignore
from tqdm import tqdm

from builds.build_utilities import *  # type: ignore
from pkt_kg.__version__ import __version__  # type: ignore
from pkt_kg.utils import *  # type: ignore
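# The repair proceeds in three steps for each archived build:
#   1. locate the NodeLabels.txt, node_metadata_dict.pkl, and Triples_Integer_Identifier_Map.json files
#   2. clean the pickled metadata dictionary (collapse newlines and repeated whitespace in each field)
#   3. regenerate NodeLabels.txt from the integer identifier map plus the cleaned metadata and re-upload both to GCS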
log_dir = 'temp/'

# find the most recent build of the current pkt_kg release in the 'pheknowlator' GCS bucket
release = 'release_v' + __version__; bucket = storage.Client().get_bucket('pheknowlator')
bucket_files = [file.name.split('/')[2] for file in bucket.list_blobs(prefix='archived_builds/' + release + '/')]
builds = [x[0] for x in [re.findall(r'(?<=_)\d.*', x) for x in bucket_files] if len(x) > 0]
sorted_dates = sorted([datetime.strftime(datetime.strptime(str(x), '%d%b%Y'), '%Y-%m-%d').upper() for x in builds])
build = 'build_' + datetime.strftime(datetime.strptime(sorted_dates[-1], '%Y-%m-%d'), '%d%b%Y').upper()
# arch_builds = [file.name for file in bucket.list_blobs(prefix='archived_builds/release_v2.0.0/build_11FEB2021/')]
# y = sorted([x for x in arch_builds if x.endswith('NodeLabels.txt') or x.endswith('node_metadata_dict.pkl') or
#             x.endswith('Triples_Integer_Identifier_Map.json')])
# for i in y: print("'" + i + "'")
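# NOTE: `directory_list` (used below) is assumed to be built elsewhere. A minimal sketch of how it could be
# assembled, following the listing pattern commented out above (one set of the three target files per build
# directory); the grouping logic here is an assumption, not part of the original script:
# suffixes = ('NodeLabels.txt', 'node_metadata_dict.pkl', 'Triples_Integer_Identifier_Map.json')
# grouped = {}
# for blob in bucket.list_blobs(prefix='archived_builds/' + release + '/' + build + '/'):
#     if blob.name.endswith(suffixes):
#         grouped.setdefault('/'.join(blob.name.split('/')[:-1]), []).append(blob.name)
# directory_list = [sorted(v) for v in grouped.values()]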
directories = directory_list  # list of file path sets, one per build directory (see sketch above)
for i in tqdm(range(0, len(directories))):
    result_set = directories[i]; meta_dict, node_label, node_int_map = None, None, None
    print('\n\n*** PROCESSING SET: {} ***'.format('/'.join(result_set[0].split('/')[0:7])))
    for f in result_set:
        if f.endswith('NodeLabels.txt'):
            node_label_org = '/'.join(f.split('/')[:-1]); node_label = f.split('/')[-1]
        if f.endswith('node_metadata_dict.pkl'):
            node_meta_org = '/'.join(f.split('/')[:-1]); node_meta = f.split('/')[-1]
            bucket.blob(f).download_to_filename(log_dir + node_meta)
            meta_dict = pickle.load(open(log_dir + node_meta, 'rb'))
        if f.endswith('Triples_Integer_Identifier_Map.json'):
            node_int_org = '/'.join(f.split('/')[:-1]); node_int = f.split('/')[-1]
            bucket.blob(f).download_to_filename(log_dir + node_int)
            node_int_map = json.load(open(log_dir + node_int, 'r'))
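    # meta_dict is expected to contain 'nodes' and 'relations' keys, each mapping an entity URI to a
    # dict with 'Label', 'Description', and 'Synonym' fields (any of which may be None)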
    if meta_dict is not None and node_label is not None and node_int_map is not None:
        # fix the metadata dictionary: collapse newlines and runs of whitespace in every metadata field
        # (e.g. 'a label\nwith  breaks' becomes 'a label with breaks')
        if len(meta_dict['nodes']) > 0 and len(meta_dict['relations']) > 0:
            print('Updating node_metadata_dict')
            temp_copy = meta_dict.copy(); meta_dict = dict()
            for key, value in temp_copy.items():
                meta_dict[key] = {}
                for e_key, val in value.items():
                    d = {k: re.sub(r'\s\s+', ' ', v.replace('\n', ' ')) if v is not None else v for k, v in val.items()}
                    meta_dict[key][e_key] = d
            del temp_copy; pickle.dump(meta_dict, open(log_dir + node_meta, 'wb'), protocol=4)
            uploads_data_to_gcs_bucket(bucket, node_meta_org + '/', log_dir, node_meta)
        else: raise ValueError('node_metadata_dict is empty!')
        # regenerate the NodeLabels.txt node list from the integer identifier map
        if len(node_int_map) > 0:
            print('Generating Updated NodeLabels.txt')
            # map keys are either n3-formatted URIs (i.e. '<http://...>') or plain strings
            entity_type = 'n3' if list(node_int_map.keys())[0].startswith('<') else 'string'
            if entity_type != 'n3': entities = set(k for k in node_int_map.keys() if k.startswith('http'))
            else: entities = set(URIRef(k.strip('<').strip('>')) for k in node_int_map.keys() if k.startswith('<http'))
            with open(log_dir + node_label, 'w', encoding='utf-8') as out:
                out.write('entity_type' + '\t' +
                          'integer_id' + '\t' +
                          'entity_uri' + '\t' +
                          'label' + '\t' +
                          'description/definition' + '\t' +
                          'synonym' + '\n')
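                # write one row per entity; label/description/synonym fall back to 'NA' for entities
                # absent from meta_dict and to 'None' for individually missing fields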
                for x in entities:
                    if entity_type != 'n3': nid, nint = str(x), node_int_map[str(x)]
                    else: nid, nint = x.n3(), node_int_map[x.n3()]  # URIRef.n3() restores the '<...>' form
                    if str(x) in meta_dict['nodes'].keys(): etyp, meta = 'NODES', meta_dict['nodes'][str(x)]
                    elif str(x) in meta_dict['relations'].keys(): etyp, meta = 'RELATIONS', meta_dict['relations'][str(x)]
                    else: meta, etyp, lab, dsc, syn = None, 'NA', 'NA', 'NA', 'NA'
                    if meta is not None:
                        lab = meta['Label'] if meta['Label'] is not None else 'None'
                        dsc = meta['Description'] if meta['Description'] is not None else 'None'
                        syn = meta['Synonym'] if meta['Synonym'] is not None else 'None'
                    try: out.write(etyp + '\t' + str(nint) + '\t' + nid + '\t' + lab + '\t' + dsc + '\t' + syn + '\n')
                    except UnicodeEncodeError:  # scrub characters (e.g. lone surrogates) that UTF-8 cannot encode
                        out.write('\t'.join(s.encode('utf-8', errors='replace').decode('utf-8')
                                            for s in (etyp, str(nint), nid, lab, dsc, syn)) + '\n')
            # upload cleaned node list to gcs bucket
            header_key = list(meta_dict['nodes'].keys())[0]; headings = list(meta_dict['nodes'][header_key].keys())
            df = pd.read_csv(log_dir + node_label, header=0, sep='\t'); df_columns = list(df.columns)
            if len(headings) + 3 == len(df_columns) and len(df) > 0:  # 3 metadata fields + entity_type, integer_id, entity_uri
                uploads_data_to_gcs_bucket(bucket, node_label_org + '/', log_dir, node_label)
                shutil.rmtree(log_dir); os.mkdir(log_dir)  # reset the temp directory for the next set
            else: print(df_columns); raise ValueError('DF has the incorrect number of columns')
        else: raise ValueError('Triples_Integer_Identifier_Map.json is empty!')
    else: raise ValueError('One of the files did not download correctly!')
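# Optional spot check (a minimal sketch, not part of the original repair): re-download one repaired
# NodeLabels.txt and confirm it parses with the expected six columns; the path below reuses the last
# processed set and is illustrative only.
# bucket.blob(node_label_org + '/' + node_label).download_to_filename('check_NodeLabels.txt')
# check_df = pd.read_csv('check_NodeLabels.txt', sep='\t', header=0)
# assert list(check_df.columns) == ['entity_type', 'integer_id', 'entity_uri', 'label',
#                                   'description/definition', 'synonym']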
This Gist is part of the solution for issue #116 in the PheKnowLator repository.