Last active
November 8, 2021 03:45
-
-
Save lordlinus/2189b16392815eaacecd531a701ee95b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# csv2hashed.py | |
# | |
# Usage: | |
# $ csv2hashed.py --salt SALT --input original.csv --output hashedfile.csv | |
# | |
# Assumptions: | |
# - This script performs basic normalization and is configured to read first_name, last_name and birthdate columns only | |
# - This script assumes the first column is always PersonID which | |
# will NOT be hashed. | |
# - Empty values (blank, undefined, null) should not be
#   hashed. This script exits with an error if it encounters one; adjust
#   this behaviour to an agreed value if required.
import argparse | |
import base64 | |
import csv | |
import hashlib | |
import re | |
import sys | |
from datetime import datetime | |
# Command-line interface: a mandatory salt plus optional input/output CSV
# streams that default to stdin/stdout when no filename is given.
parser = argparse.ArgumentParser(description="A tool to hash records in a csv file")
parser.add_argument("-u", "--salt", required=True, help="Salt value")

# The two stream options differ only in flags, open mode, default and help
# text, so they are declared from a small table (input first, then output).
_stream_options = (
    ("-i", "--input", "rt", sys.stdin,
     "Read from filename. The file must be readable and in CSV format encoded in UTF-8"),
    ("-o", "--output", "wt", sys.stdout,
     "Write the hashed results to filename in CSV format"),
)
for _short, _long, _mode, _fallback, _help in _stream_options:
    parser.add_argument(
        _short,
        _long,
        help=_help,
        type=argparse.FileType(_mode, encoding="UTF-8"),
        default=_fallback,
        required=False,
    )

# Parsed at import time; the rest of the script reads args.salt, args.input
# and args.output.
args = parser.parse_args()
# Matches every character that is not a lowercase ASCII letter.
pattern = re.compile(r"[^a-z]")


# Function to remove non-English letters, and a leading dotted title, from the text
def normalise_text(text: str) -> str:
    """Return *text* lowercased and reduced to the letters a-z.

    If the value contains a '.', everything up to and including the first
    '.' is dropped before normalising (so "Mr. John" becomes "john").

    Empty values must not be hashed: when *text* is None, blank,
    whitespace-only, or one of the placeholders '-' / 'NA', a message is
    written to stderr and the process exits with status 1.
    """
    # csv.DictReader yields None for columns missing from a short row, and
    # blank cells arrive as '' -- both are "empty" just like the placeholders.
    if text is None or text.strip() in ('', '-', 'NA'):
        # stderr, because stdout may be carrying the CSV output.
        print('Empty value encountered for the field', file=sys.stderr)
        sys.exit(1)

    if '.' in text:
        # NOTE(review): a trailing suffix such as "John Jr." normalises to ''
        # under this rule; confirm this is the agreed behaviour.
        lower_text = "".join(text.split('.')[1:]).lower()
    else:
        lower_text = text.lower()
    return pattern.sub("", lower_text)
date_format = "%d/%m/%Y"  # 20/10/1928


# Function to normalise the date field
def normalise_date(date_string: str) -> str:
    """Convert a date written in ``date_format`` to ISO ``YYYY-MM-DD`` form.

    If *date_string* does not match ``date_format`` (or is not a string at
    all), a message is written to stderr and the process exits with status 1.
    """
    try:
        date_obj = datetime.strptime(date_string, date_format)
    except (TypeError, ValueError):
        # TypeError covers a missing (None) cell; ValueError a malformed date.
        # stderr, because stdout may be carrying the CSV output.
        print(
            'Date time text: {} does not match with {} format. Change "date_format" variable'
            .format(date_string, date_format),
            file=sys.stderr)
        sys.exit(1)
    # isoformat() always yields a zero-padded 10-character date, whereas
    # strftime("%Y") leaves years below 1000 unpadded on some platforms.
    return date_obj.date().isoformat()
# SHA-512 digest of the text, returned as base64
# NOTE: this name shadows the builtin hash() for the rest of the module.
def hash(text: str) -> str:
    """Return the base64-encoded SHA-512 digest of *text* (UTF-8 encoded)."""
    raw_bytes = text.encode('utf-8')
    digest = hashlib.sha512(raw_bytes).digest()
    encoded = base64.b64encode(digest)
    return encoded.decode('utf-8')
# Read the CSV input with personid, first_name, last_name and birth_date columns,
# normalise and concatenate the three identifying fields, append the salt and hash.
# Write a CSV with personid and custom_name (the hashed value).
#
# argparse.FileType has already opened both streams as UTF-8, so they are used
# directly. Re-opening them by name would turn the stdin/stdout defaults into
# files literally called '<stdin>'/'<stdout>', drop the UTF-8 encoding and
# leak the handles argparse opened.
required_columns = ('personid', 'first_name', 'last_name', 'birth_date')

# The csv module requires its files to be opened with newline=''; FileType
# cannot pass that through, so apply it to the already-open streams.
for stream in (args.input, args.output):
    if hasattr(stream, 'reconfigure'):
        stream.reconfigure(newline='')

try:
    # Status messages go to stderr so they never mix with CSV data on stdout.
    print("Reading csv file '{}'".format(args.input.name), file=sys.stderr)
    reader = csv.DictReader(args.input)

    # Fail with a clear message instead of a KeyError on the first row.
    missing = [name for name in required_columns
               if name not in (reader.fieldnames or [])]
    if missing:
        print("Input is missing required column(s): {}".format(
            ", ".join(missing)), file=sys.stderr)
        sys.exit(1)

    writer = csv.DictWriter(args.output,
                            fieldnames=['personid', 'custom_name'])
    writer.writeheader()
    for row in reader:
        concat_string = (normalise_text(row['first_name']) +
                         normalise_text(row['last_name']) +
                         normalise_date(row['birth_date']))
        hashed_value = hash(concat_string + args.salt)
        writer.writerow({
            'personid': row['personid'],
            'custom_name': hashed_value
        })
    args.output.flush()
finally:
    # Close real files, but leave the process-wide standard streams open.
    if args.input is not sys.stdin:
        args.input.close()
    if args.output is not sys.stdout:
        args.output.close()

print("Hashed csv file saved as: '{}'".format(args.output.name), file=sys.stderr)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment