Anonymization for job statistics from HTCondor
#!/usr/bin/python
import sys
import json
import random
import time
import argparse
import hashlib


def load_data(source):
    """
    Load JSON formatted data from a file path or stdin

    :param str source: a file path or ``"-"`` for stdin
    :return: list of classads for all jobs
    :rtype: list[dict]
    """
    if source == '-':
        return json.load(sys.stdin)
    else:
        with open(source, 'r') as file_source:
            return json.load(file_source)


def write_data(dest, data, pretty):
    """
    Write JSON formatted data to a file path or stdout

    :param str dest: a file path or ``"-"`` for stdout
    :param data: list of classads for all jobs
    :type data: list[dict]
    :param bool pretty: whether to use human-readable formatting
    """
    if pretty:
        kwargs = {'indent': 1, 'separators': (',', ': ')}
    else:
        kwargs = {}
    if dest == '-':
        json.dump(data, sys.stdout, **kwargs)
    else:
        with open(dest, 'w') as file_dest:
            json.dump(data, file_dest, **kwargs)


def anonymise(all_data, salt, attributes):
    """
    Anonymise every ``attribute`` for each job in ``all_data``

    :param all_data: list of classads for all jobs
    :type all_data: list[dict]
    :param str salt: data to insert when anonymising attributes
    :param iterable[str] attributes: names of attributes to anonymise
    :return: list of classads for all jobs
    :rtype: list[dict]
    """
    attributes = set(attributes)
    anon_data = []
    for job_data in all_data:
        anon_data.append({
            key: anonymise_item(salt, key, value, attributes)
            for key, value in job_data.items()
        })
    return anon_data


def anonymise_item(salt, key, value, attributes):
    """Anonymise an item as necessary"""
    try:
        return DISPATCH_ATTRIBUTES[key](salt, value)
    except KeyError:
        if key in attributes:
            return anonymise_value(salt, value)
        else:
            return value
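

# Helper: salted hashing of a single value. The salt is split into its even-
# and odd-indexed characters and wrapped around the value before hashing with
# SHA-1, so equal values map to equal digests within one run, while the
# digests cannot be regenerated without knowing the salt.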
def anonymise_value(salt, value):
    anon = hashlib.sha1()
    # encode to bytes so hashing also works for non-ASCII values and on Python 3
    anon.update(salt[::2].encode('utf-8'))
    anon.update(value.encode('utf-8'))
    anon.update(salt[1::2].encode('utf-8'))
    return anon.hexdigest()
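

# Helper: for dotted values such as ``AcctGroup`` entries, only the leading
# dot-separated component is hashed; the remaining components are preserved
# verbatim.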
def anonymise_dotted(salt, value):
    parts = value.split('.')
    return '.'.join([anonymise_value(salt, parts[0])] + parts[1:])


def main():
    options = CLI.parse_args()
    data = load_data(options.source)
    data = anonymise(data, options.salt, options.attributes)
    write_data(options.dest, data, options.pretty)


#: names of attributes to anonymise
SENSITIVE_ATTRIBUTES = [
    # user information
    'AccountingGroup', 'AcctGroup', 'AcctGroupUser', 'Owner',
    # paths
    'Cmd', 'Err', 'In', 'Iwd', 'Out', 'TransferInput', 'UserLog',
    # job information
    'GlobalJobId', 'JobDescription', 'Environment',
]
#: special anonymisation for individual attributes
DISPATCH_ATTRIBUTES = {
    'AcctGroup': anonymise_dotted,
}
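# Additional per-attribute handlers can be added to DISPATCH_ATTRIBUTES by
# mapping an attribute name to a callable taking ``(salt, value)``.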
CLI = argparse.ArgumentParser(
    description="anonymise results of condor_q/condor_history json dumps"
)
CLI.add_argument(
    '--attributes',
    help='Attributes to anonymise',
    nargs='*',
    default=SENSITIVE_ATTRIBUTES,
)
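# The default salt combines the current epoch time with a random number, so
# hashes are only comparable within a single run unless an explicit --salt
# is supplied.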
CLI.add_argument(
    '--salt',
    help='salt base to use for hashing',
    default='%010d%06d' % (time.time(), random.random() * 100000),
)
CLI.add_argument(
    '--source',
    help='path to json input data or - for stdin',
    default='-',
)
CLI.add_argument(
    '--dest',
    help='path to json output data or - for stdout',
    default='-',
)
CLI.add_argument(
    '--pretty',
    help='pretty-print output',
    action='store_true',
)


if __name__ == '__main__':
    main()
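
A minimal usage sketch, assuming the script above is saved as anonymise_condor.py (a name chosen here for illustration) and jobs.json holds the output of condor_q -json or condor_history -json; the helpers can also be called directly from Python:

    from anonymise_condor import load_data, anonymise, write_data, SENSITIVE_ATTRIBUTES

    jobs = load_data('jobs.json')         # classads for all jobs
    anon = anonymise(jobs, salt='local-secret', attributes=SENSITIVE_ATTRIBUTES)
    write_data('-', anon, pretty=True)    # pretty-print the anonymised classads to stdout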