lordlinus · November 8, 2021 03:45
diff --git a/csv2hashed.py b/csv2hashed.py
 #!/usr/bin/env python

 # csv2hashed.py
 #
 # Usage:
 #   $ csv2hashed.py --salt SALT --input original.csv --output hashedfile.csv
 #
 # Assumptions:
 #   - This script performs basic normalization and is configured to read first_name, last_name and birthdate columns only
 #   - This script assumes the first column is always PersonID which
 #     will NOT be hashed.
 #   - Empty values (blank, undefined, null) should not be
 #     hashed. This script exits with an error if it encounters one; adjust the handling to an agreed value if needed

 import argparse
 import base64
 import csv
 import hashlib
 import re
 import sys
 from datetime import datetime

 parser = argparse.ArgumentParser(
    description=
    "A tool to hash records in a csv file"
 )
 parser.add_argument('-u', '--salt', help='Salt value', required=True)
 parser.add_argument('-i',
                    '--input',
                    help='Read from filename. The file must be readable '
                    'and in CSV format encoded in UTF-8',
                    type=argparse.FileType('rt', encoding='UTF-8'),
                    default=sys.stdin,
                    required=False)
 parser.add_argument('-o',
                    '--output',
                    help='Write the hashed results to filename in CSV format',
                    type=argparse.FileType('wt', encoding='UTF-8'),
                    default=sys.stdout,
                    required=False)
 args = parser.parse_args()

 pattern = re.compile(r"[^a-z]")


 # Normalise a name field: drop a leading title, lower-case it, keep only a-z.
 def normalise_text(text: str) -> str:
    """Return ``text`` reduced to lower-case ASCII letters.

    If the value contains a '.', everything up to and including the first
    '.' is dropped (e.g. "Dr. John" -> "john"). NOTE(review): a trailing '.'
    such as "Smith Jr." therefore normalises to an empty string -- confirm
    this is the agreed behaviour before changing it, because it affects the
    resulting hashes.

    Blank values (empty or whitespace-only) and the placeholders '-' and
    'NA' are treated as missing: a message is written to stderr and the
    script exits with status 1.
    """
    stripped = text.strip()
    if not stripped or stripped in ('-', 'NA'):
        # Report on stderr so the message never ends up inside CSV output
        # that is being written to stdout.
        print('Empty value encountered for the field', file=sys.stderr)
        sys.exit(1)

    if '.' in text:
        lower_text = "".join(text.split('.')[1:]).lower()
    else:
        lower_text = text.lower()
    return pattern.sub("", lower_text)


 date_format = "%d/%m/%Y" # 20/10/1928


 # Function to normalise the date field
 def normalise_date(date_string: str) -> str:
    """Convert ``date_string`` from ``date_format`` to ISO ``YYYY-MM-DD``.

    If the value does not match ``date_format``, a message is written to
    stderr and the script exits with status 1.
    """
    try:
        date_obj = datetime.strptime(date_string, date_format)
    except ValueError:
        print(
            'Date time text: {} does not match with {} format. Change "date_format" variable'
            .format(date_string, date_format),
            file=sys.stderr)
        sys.exit(1)
    # date.isoformat() always zero-pads the year to four digits, so the result
    # is exactly 10 characters. strftime("%Y") may leave years below 1000
    # unpadded on some platforms, and the former `assert` length check is
    # stripped when Python runs with -O.
    return date_obj.date().isoformat()


 # use SHA-512 hash
 def hash(text: str) -> str:
    """Return the base64-encoded SHA-512 digest of ``text`` (UTF-8 encoded)."""
    raw_digest = hashlib.sha512(text.encode('utf-8')).digest()
    encoded = base64.b64encode(raw_digest)
    return encoded.decode('utf-8')


 # Read CSV file with personid, first_name, last_name and birth_date columns,
 # normalise, combine, apply salt value and hash.
 # Write CSV file with personid and custom_name (hashed value).
 with args.input as csvreaderfile, args.output as csvwriterfile:
    # argparse.FileType has already opened both streams as UTF-8 (or bound
    # them to stdin/stdout), so use them directly. Re-opening by name leaked
    # the original handles, dropped the UTF-8 encoding and failed for the
    # '<stdin>'/'<stdout>' defaults.
    # Status messages go to stderr so they never mix with CSV written to
    # stdout.
    print("Reading csv file '{}'".format(args.input.name), file=sys.stderr)
    reader = csv.DictReader(csvreaderfile)
    writer = csv.DictWriter(csvwriterfile,
                            fieldnames=['personid', 'custom_name'])
    writer.writeheader()
    for row in reader:
        # Hash input: normalised first name + last name + ISO birth date,
        # followed by the salt.
        concat_string = normalise_text(row['first_name']) + normalise_text(
            row['last_name']) + normalise_date(row['birth_date'])
        hashed_value = hash(concat_string + args.salt)
        writer.writerow({
            'personid': row['personid'],
            'custom_name': hashed_value
        })
    print("Hashed csv file saved as: '{}'".format(args.output.name),
          file=sys.stderr)
	#!/usr/bin/env python

	# csv2hashed.py
	#
	# Usage:
	# $ csv2hashed.py --salt SALT --input original.csv --output hashedfile.csv
	#
	# Assumptions:
	# - This script performs basic normalization and is configured to read first_name, last_name and birthdate columns only
	# - This script assumes the first column is always PersonID which
	# will NOT be hashed.
	# - Empty values (blank, undefined, null) should not be
	# hashed. This script exits with an error if it encounters one; adjust the handling to an agreed value if needed

	import argparse
	import base64
	import csv
	import hashlib
	import re
	import sys
	from datetime import datetime

	parser = argparse.ArgumentParser(
	description=
	"A tool to hash records in a csv file"
	)
	parser.add_argument('-u', '--salt', help='Salt value', required=True)
	parser.add_argument('-i',
	'--input',
	help='Read from filename. The file must be readable '
	'and in CSV format encoded in UTF-8',
	type=argparse.FileType('rt', encoding='UTF-8'),
	default=sys.stdin,
	required=False)
	parser.add_argument('-o',
	'--output',
	help='Write the hashed results to filename in CSV format',
	type=argparse.FileType('wt', encoding='UTF-8'),
	default=sys.stdout,
	required=False)
	args = parser.parse_args()

	pattern = re.compile(r"[^a-z]")


	# Normalise a name field: drop a leading title, lower-case it, keep only a-z.
	def normalise_text(text: str) -> str:
	    """Return ``text`` reduced to lower-case ASCII letters.

	    If the value contains a '.', everything up to and including the first
	    '.' is dropped (e.g. "Dr. John" -> "john"). NOTE(review): a trailing '.'
	    such as "Smith Jr." therefore normalises to an empty string -- confirm
	    this is the agreed behaviour before changing it, because it affects the
	    resulting hashes.

	    Blank values (empty or whitespace-only) and the placeholders '-' and
	    'NA' are treated as missing: a message is written to stderr and the
	    script exits with status 1.
	    """
	    stripped = text.strip()
	    if not stripped or stripped in ('-', 'NA'):
	        # Report on stderr so the message never ends up inside CSV output
	        # that is being written to stdout.
	        print('Empty value encountered for the field', file=sys.stderr)
	        sys.exit(1)

	    if '.' in text:
	        lower_text = "".join(text.split('.')[1:]).lower()
	    else:
	        lower_text = text.lower()
	    return pattern.sub("", lower_text)


	date_format = "%d/%m/%Y" # 20/10/1928


	# Function to normalise the date field
	def normalise_date(date_string: str) -> str:
	    """Convert ``date_string`` from ``date_format`` to ISO ``YYYY-MM-DD``.

	    If the value does not match ``date_format``, a message is written to
	    stderr and the script exits with status 1.
	    """
	    try:
	        date_obj = datetime.strptime(date_string, date_format)
	    except ValueError:
	        print(
	            'Date time text: {} does not match with {} format. Change "date_format" variable'
	            .format(date_string, date_format),
	            file=sys.stderr)
	        sys.exit(1)
	    # date.isoformat() always zero-pads the year to four digits, so the result
	    # is exactly 10 characters. strftime("%Y") may leave years below 1000
	    # unpadded on some platforms, and the former `assert` length check is
	    # stripped when Python runs with -O.
	    return date_obj.date().isoformat()


	# use SHA-512 hash
	def hash(text: str) -> str:
	    """Return the base64-encoded SHA-512 digest of ``text`` (UTF-8 encoded)."""
	    return base64.b64encode(hashlib.sha512(
	        text.encode('utf-8')).digest()).decode('utf-8')


	# Read CSV file with personid, first_name, last_name and birth_date columns,
	# normalise, combine, apply salt value and hash.
	# Write CSV file with personid and custom_name (hashed value).
	with args.input as csvreaderfile, args.output as csvwriterfile:
	    # argparse.FileType has already opened both streams as UTF-8 (or bound
	    # them to stdin/stdout), so use them directly. Re-opening by name leaked
	    # the original handles, dropped the UTF-8 encoding and failed for the
	    # '<stdin>'/'<stdout>' defaults.
	    # Status messages go to stderr so they never mix with CSV written to
	    # stdout.
	    print("Reading csv file '{}'".format(args.input.name), file=sys.stderr)
	    reader = csv.DictReader(csvreaderfile)
	    writer = csv.DictWriter(csvwriterfile,
	                            fieldnames=['personid', 'custom_name'])
	    writer.writeheader()
	    for row in reader:
	        # Hash input: normalised first name + last name + ISO birth date,
	        # followed by the salt.
	        concat_string = normalise_text(row['first_name']) + normalise_text(
	            row['last_name']) + normalise_date(row['birth_date'])
	        hashed_value = hash(concat_string + args.salt)
	        writer.writerow({
	            'personid': row['personid'],
	            'custom_name': hashed_value
	        })
	    print("Hashed csv file saved as: '{}'".format(args.output.name),
	          file=sys.stderr)