Skip to content

Instantly share code, notes, and snippets.

@lordlinus
Last active November 8, 2021 03:45
Show Gist options
  • Save lordlinus/2189b16392815eaacecd531a701ee95b to your computer and use it in GitHub Desktop.
Save lordlinus/2189b16392815eaacecd531a701ee95b to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# csv2hashed.py
#
# Usage:
# $ csv2hashed.py --salt SALT --input original.csv --output hashedfile.csv
#
# Assumptions:
# - This script performs basic normalization and reads only the personid,
#   first_name, last_name and birth_date columns.
# - The personid column is copied to the output unchanged; it is NOT hashed.
# - Empty values (blank, undefined, null) should not be hashed. This script
#   exits with an error if it encounters one; adjust this to an agreed value
#   if a different behaviour is required.
import argparse
import base64
import csv
import hashlib
import re
import sys
from datetime import datetime
# Command-line interface: a mandatory salt plus optional input/output files.
# When omitted, input falls back to stdin and output to stdout.
parser = argparse.ArgumentParser(
    description="A tool to hash records in a csv file")

# The salt is mixed into every hashed value, so it must always be given.
parser.add_argument('-u', '--salt', required=True, help='Salt value')

# Source CSV; argparse opens it for reading as UTF-8 text.
parser.add_argument(
    '-i', '--input',
    type=argparse.FileType('rt', encoding='UTF-8'),
    default=sys.stdin,
    required=False,
    help='Read from filename. The file must be readable '
         'and in CSV format encoded in UTF-8',
)

# Destination CSV; argparse opens it for writing as UTF-8 text.
parser.add_argument(
    '-o', '--output',
    type=argparse.FileType('wt', encoding='UTF-8'),
    default=sys.stdout,
    required=False,
    help='Write the hashed results to filename in CSV format',
)

# Parsed once at start-up; the rest of the script reads its options from here.
args = parser.parse_args()
# Matches every character that is not a lowercase ASCII letter.
pattern = re.compile(r"[^a-z]")
# Placeholder strings that stand for "no value" (compared after stripping
# surrounding whitespace).
_EMPTY_MARKERS = ('-', 'NA')


# Function to remove non english alphabets from the text, titles and suffixes
def normalise_text(text: str) -> str:
    """Return *text* reduced to its lowercase ASCII letters.

    If the value contains a '.', everything up to and including the first
    '.' is dropped before normalising (so ``"Mr. John"`` becomes
    ``"john"``). The remainder is lower-cased and every character outside
    ``a-z`` is removed.

    NOTE(review): because only the part after the first '.' is kept, a value
    whose only '.' is trailing (e.g. ``"John Jr."``) normalises to an empty
    string. This is left unchanged so existing hashes stay stable; confirm
    whether it is intended.

    Args:
        text: Raw field value; ``None`` is treated as an empty value.

    Returns:
        The normalised string.

    Raises:
        SystemExit: with exit code 1 when the value is empty, consists only
            of whitespace, is ``None``, or is one of the placeholders
            ``-`` / ``NA``. Empty values must never be hashed.
    """
    stripped = text.strip() if text is not None else ''
    if not stripped or stripped in _EMPTY_MARKERS:
        # Report on stderr so the message cannot end up inside CSV data that
        # is being written to stdout.
        print('Empty value encountered for the field', file=sys.stderr)
        sys.exit(1)

    if '.' in text:
        lower_text = "".join(text.split('.')[1:]).lower()
    else:
        lower_text = text.lower()
    return pattern.sub("", lower_text)
date_format = "%d/%m/%Y"  # 20/10/1928


# Function to normalise the date field
def normalise_date(date_string: str) -> str:
    """Convert a date written in ``date_format`` to ISO ``YYYY-MM-DD``.

    Args:
        date_string: Date text expected to match the module-level
            ``date_format`` (day/month/four-digit year).

    Returns:
        The date as a 10-character ``YYYY-MM-DD`` string.

    Raises:
        SystemExit: with exit code 1 when *date_string* does not match
            ``date_format`` or is not a string at all (e.g. ``None``).
    """
    try:
        date_obj = datetime.strptime(date_string, date_format)
    except (ValueError, TypeError):
        # TypeError covers a missing value (None); ValueError a bad format.
        # Report on stderr so the message cannot end up inside CSV data that
        # is being written to stdout.
        print(
            'Date time text: {} does not match with {} format. Change "date_format" variable'
            .format(date_string, date_format),
            file=sys.stderr)
        sys.exit(1)
    # isoformat() always zero-pads the year to four digits, so the result is
    # 10 characters long on every platform. strftime("%Y") does not guarantee
    # that for years below 1000, and an assert-based length check would be
    # stripped when Python runs with -O.
    return date_obj.date().isoformat()
# SHA-512 digest of the text, returned Base64-encoded
# NOTE: this name shadows the builtin hash(); it is kept because the rest of
# the script calls it by this name.
def hash(text: str) -> str:
    """Return the Base64-encoded SHA-512 digest of *text*.

    The text is encoded as UTF-8 before hashing, and the Base64 bytes are
    decoded back to a ``str`` for the caller.
    """
    utf8_bytes = text.encode('utf-8')
    raw_digest = hashlib.sha512(utf8_bytes).digest()
    encoded_digest = base64.b64encode(raw_digest)
    return encoded_digest.decode('utf-8')
# Read the input CSV (personid, first_name, last_name and birth_date columns),
# normalise and concatenate the three identifying fields, append the salt and
# hash the result, then write a CSV with personid and custom_name (the hashed
# value).
#
# The streams argparse already opened are used directly rather than being
# reopened by name. Reopening breaks the stdin/stdout defaults (their .name is
# "<stdin>" / "<stdout>", which are not real paths), drops the UTF-8 encoding
# requested on the command line and leaves the original handles open.
required_columns = ('personid', 'first_name', 'last_name', 'birth_date')
csvreaderfile = args.input
csvwriterfile = args.output

# The csv module expects newline translation to be disabled on the file it
# writes to; otherwise its "\r\n" row endings become "\r\r\n" on Windows.
if hasattr(csvwriterfile, 'reconfigure'):
    csvwriterfile.reconfigure(newline='')

try:
    # Status messages go to stderr so they never mix with CSV data on stdout.
    print("Reading csv file '{}'".format(csvreaderfile.name), file=sys.stderr)
    reader = csv.DictReader(csvreaderfile)

    # fieldnames is None for a completely empty input; that case still
    # produces a header-only output file, as before.
    if reader.fieldnames is not None:
        missing_columns = [
            name for name in required_columns
            if name not in reader.fieldnames
        ]
        if missing_columns:
            # Fail with a clear message instead of a KeyError traceback.
            print(
                "Input csv file is missing required column(s): {}".format(
                    ", ".join(missing_columns)),
                file=sys.stderr)
            sys.exit(1)

    writer = csv.DictWriter(csvwriterfile,
                            fieldnames=['personid', 'custom_name'])
    writer.writeheader()
    for row in reader:
        # Identity string: normalised first name + last name + ISO birth date.
        concat_string = (normalise_text(row['first_name'])
                         + normalise_text(row['last_name'])
                         + normalise_date(row['birth_date']))
        # The salt is appended to the identity string before hashing.
        hashed_value = hash(concat_string + args.salt)
        writer.writerow({
            'personid': row['personid'],
            'custom_name': hashed_value
        })
finally:
    # Close real files, but leave the interpreter's standard streams open.
    for stream in (csvreaderfile, csvwriterfile):
        if stream is not sys.stdin and stream is not sys.stdout:
            stream.close()

print("Hashed csv file saved as: '{}'".format(csvwriterfile.name),
      file=sys.stderr)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment