Skip to content

Instantly share code, notes, and snippets.

@maxfischer2781
Created December 15, 2017 14:41
Show Gist options
  • Save maxfischer2781/25e0cf9e511833c03183b65da1d3845d to your computer and use it in GitHub Desktop.
Save maxfischer2781/25e0cf9e511833c03183b65da1d3845d to your computer and use it in GitHub Desktop.
Anonymization for job statistics from HTCondor
#!/usr/bin/python
import sys
import json
import random
import time
import argparse
import hashlib
def load_data(source):
    """
    Read JSON formatted job data from a file path or from stdin

    :param str source: path of the file to read, or ``"-"`` to read stdin
    :return: the decoded JSON document; expected to be the list of
             classads for all jobs
    :rtype: list[dict]
    """
    if source != '-':
        # the context manager closes the file even if decoding fails
        with open(source, 'r') as stream:
            return json.load(stream)
    return json.load(sys.stdin)
def write_data(dest, data, pretty):
    """
    Serialise job data as JSON to a file path or to stdout

    :param str dest: path of the file to write, or ``"-"`` to write stdout
    :param data: list of classads for all jobs
    :type data: list[dict]
    :param bool pretty: whether to use human-readable formatting
                        (one item per line, indented by one space)
    """
    dump_options = {'indent': 1, 'separators': (',', ': ')} if pretty else {}
    if dest != '-':
        with open(dest, 'w') as stream:
            json.dump(data, stream, **dump_options)
        return
    json.dump(data, sys.stdout, **dump_options)
def anonymise(all_data, salt, attributes):
    """
    Build an anonymised copy of the classads of all jobs

    Every item of every job is passed through :py:func:`anonymise_item`;
    the input ``all_data`` is not modified.

    :param all_data: list of classads for all jobs
    :type all_data: list[dict]
    :param str salt: data to insert when anonymising attributes
    :param iterable[str] attributes: names of attributes to anonymise
    :return: list of classads for all jobs
    :rtype: list[dict]
    """
    # a set gives constant-time membership tests for every item of every job
    sensitive = set(attributes)
    return [
        {
            name: anonymise_item(salt, name, content, sensitive)
            for name, content in job.items()
        }
        for job in all_data
    ]
def anonymise_item(salt, key, value, attributes):
    """
    Anonymise a single classad item as necessary

    An attribute with an entry in ``DISPATCH_ATTRIBUTES`` is passed to its
    dedicated handler. Any other attribute is hashed by
    :py:func:`anonymise_value` if its name is in ``attributes``, and is
    returned unchanged otherwise.

    :param str salt: data to insert when anonymising the value
    :param str key: name of the attribute
    :param value: value of the attribute
    :param attributes: names of attributes to anonymise; any container
                       supporting ``in``
    :return: the anonymised value, or ``value`` itself if not sensitive
    """
    # Only the handler *lookup* decides whether a handler exists. An error
    # raised inside a handler (including ``KeyError``) must propagate rather
    # than fall through to the branches below, which could return the raw,
    # un-anonymised value.
    # NOTE(review): a handler runs even if ``key`` is not in ``attributes``
    # - confirm that this is intended.
    handler = DISPATCH_ATTRIBUTES.get(key)
    if handler is not None:
        return handler(salt, value)
    if key in attributes:
        return anonymise_value(salt, value)
    return value
def anonymise_value(salt, value):
    """
    Replace ``value`` by the hex digest of its salted SHA-1 hash

    The salt is split around the value: its even-indexed characters are
    hashed before the value and its odd-indexed characters after it.
    ``hashlib`` only accepts bytes, so text is encoded as UTF-8 first;
    this keeps digests of ASCII input identical to hashing the raw bytes.

    :param str salt: data to insert when hashing the value
    :param value: the value to anonymise; values that are neither text nor
                  bytes (e.g. numbers) are hashed via their ``str`` form
    :return: hexadecimal digest replacing the value
    :rtype: str
    """
    anon = hashlib.sha1()
    for chunk in (salt[::2], value, salt[1::2]):
        anon.update(_hashable_bytes(chunk))
    return anon.hexdigest()


def _hashable_bytes(data):
    """Convert ``data`` to bytes that can be fed to a ``hashlib`` object"""
    if isinstance(data, bytes):
        return data
    if not hasattr(data, 'encode'):
        # not text: fall back to the string representation
        data = str(data)
    return data.encode('utf-8')
def anonymise_dotted(salt, value):
    """
    Anonymise only the first component of a dot-separated ``value``

    Everything from the first ``.`` onwards is kept verbatim.

    :param str salt: data to insert when anonymising the first component
    :param str value: dot-separated value, e.g. ``"head.sub.leaf"``
    :return: ``value`` with its first component replaced by a hash
    :rtype: str
    """
    # split once: the remainder stays untouched, inner dots included
    pieces = value.split('.', 1)
    pieces[0] = anonymise_value(salt, pieces[0])
    return '.'.join(pieces)
def main():
    """Parse the command line, then load, anonymise and write the job data"""
    args = CLI.parse_args()
    anonymised = anonymise(load_data(args.source), args.salt, args.attributes)
    write_data(args.dest, anonymised, args.pretty)
#: names of attributes to anonymise by default
SENSITIVE_ATTRIBUTES = [
    # user information
    'AccountingGroup', 'AcctGroup', 'AcctGroupUser', 'Owner',
    # paths
    'Cmd', 'Err', 'In', 'Iwd', 'Out', 'TransferInput', 'UserLog',
    # job information
    'GlobalJobId', 'JobDescription', 'Environment',
    # NOTE: every row must end with a comma - adjacent string literals are
    # concatenated implicitly, which silently merges two attribute names
]
#: attributes with a dedicated anonymisation function instead of plain
#: hashing; each function is called as ``handler(salt, value)``
DISPATCH_ATTRIBUTES = dict(
    AcctGroup=anonymise_dotted,
)
#: command line interface of the script
# The text must be passed as ``description``: the first positional parameter
# of ``ArgumentParser`` is ``prog``, which would put the whole sentence into
# the usage line as the program name.
CLI = argparse.ArgumentParser(
    description="anonymise results of condor_q/condor_history json dumps",
)
CLI.add_argument(
    '--attributes',
    help='Attributes to anonymise',
    nargs='*',
    default=SENSITIVE_ATTRIBUTES,
)
CLI.add_argument(
    '--salt',
    help='salt base to use for hashing',
    # The salt is all that prevents hashes of guessable values (e.g. user
    # names) from being brute-forced. The default therefore draws 128 bits
    # from the OS entropy source instead of a few digits from the default
    # PRNG; the timestamp prefix is kept from the previous format.
    default='%010d%032x' % (
        time.time(), random.SystemRandom().getrandbits(128)
    ),
)
CLI.add_argument(
    '--source',
    help='path to json input data or - for stdin',
    default='-',
)
CLI.add_argument(
    '--dest',
    help='path to json output data or - for stdout',
    default='-',
)
CLI.add_argument(
    '--pretty',
    help='pretty-print output',
    action='store_true',
)
# run the command line interface only when executed as a script
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment