Created
February 27, 2020 05:58
-
-
Save y2k-shubham/3b4d3d5c0163866e038128f6a34d32ac to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import hashlib | |
import os | |
import sys | |
# /////////////////////////
# file-read / write methods
# /////////////////////////
def get_utf8_file_descriptor(file_path, mode):
    """
    Open a file as UTF-8 encoded text, on Python 2 as well as Python 3,
    so that non-ASCII characters can be read and written
    ..
    :param file_path: complete qualified file path + name
    :type file_path: str
    :param mode: file opening mode (like "r" for read, "w" for write),
                 handed unchanged to the underlying open call
    :type mode: str
    :return: the opened file object (usable in a `with` statement)
    """
    # io.open takes the `encoding` argument on Python 2 too, and on
    # Python 3 it is the builtin open itself, so one call serves both
    import io
    return io.open(file_path, mode=mode, encoding="utf-8")
def read_lines(file_path):
    """
    Load a UTF-8 text file completely and break its content into lines
    ..
    :param file_path: complete qualified file path + name
    :type file_path: str
    :return: the lines of the file, in order, without line terminators
    :type: List[str]
    """
    with get_utf8_file_descriptor(file_path=file_path, mode="r") as source:
        # the whole file is pulled into memory in a single read
        content = source.read()
    # splitting happens on the in-memory text, after the file is closed
    return content.splitlines()
def write_lines(file_path, lines):
    """
    Store a list of strings in a UTF-8 text file, one item per line
    (an existing file at that path is overwritten)
    :param file_path: complete qualified output file-path + name
    :type file_path: str
    :param lines: list of strings to be written to file
    :type lines: List[str]
    :return: None
    """
    with get_utf8_file_descriptor(file_path=file_path, mode="wt") as sink:
        # the newline goes between items only, so nothing follows the last one
        text = "\n".join(lines)
        sink.write(text)
# /////////////////////////
# string manipulation / hashing methods
# /////////////////////////
def is_enclosed_by(word, punctuation="\""):
    """
    Determines whether or not the given word both starts and ends
    with the given punctuation mark (e.g. a double-quoted CSV field).
    An empty word, or a word too short to hold a separate opening and
    closing mark (such as a lone quote character), is not enclosed.
    ..
    :param word: string to be checked for enclosure
    :type word: str
    :param punctuation: punctuation mark expected at both ends
    :type punctuation: str (normally a single-character string)
    :return: Boolean denoting whether or not string is enclosed
    :type: bool
    """
    if not punctuation:
        # an empty mark cannot enclose anything
        return False
    if len(word) < 2 * len(punctuation):
        # also covers the empty word, which has no first / last character
        return False
    return word.startswith(punctuation) and word.endswith(punctuation)
def hash_word(word):
    """
    Returns the SHA-1 hash digest of a string
    (SHA-1 produces a 160-bit digest, i.e. 40 hexadecimal characters)
    NOTE: SHA-1 is no longer regarded as collision-resistant; the digest
    should not be relied upon as cryptographic protection.
    :param word: string to be hashed
    :type word: str
    :return: hexadecimal SHA-1 digest of the UTF-8 encoded string
    :type: str
    """
    # encode explicitly: the default codec on Python 2 is ASCII, which
    # fails on non-ASCII text, while Python 3 already defaults to UTF-8
    return hashlib.sha1(word.encode("utf-8")).hexdigest()
def hash_lines(lines, column_positions_to_hash, contains_column_headers):
    """
    Accepts a list of string (lines) and a list of positions (ints)
    - Assumes each item of lines is a row of CSV
    - Splits each line of lines by comma ','
    - Hashes (via hash_word) those tokens (words) of line whose position
      is specified by column_positions_to_hash; surrounding double quotes
      are removed before hashing and put back around the digest when the
      token was enclosed in them
    - rebuilds and returns the list of strings with new (hashed) data
    - the header row (when present) and blank lines are returned unchanged
    NOTE: rows are split on every comma, so a quoted field that itself
    contains a comma is treated as several columns.
    ..
    :param lines: list of strings (lines) of a CSV file
    :type: lines: List[str]
    :param column_positions_to_hash: list of column positions (1-indexed) to be hashed
                                     in each item (line) of lines
    :type column_positions_to_hash: List[int]
    :param contains_column_headers: whether or not input list of lines includes
                                    (first line) as column headers
    :type contains_column_headers: bool
    :return: list of strings formed by hashing words at specified positions
    :type: List[str]
    :raises IndexError: when a non-blank data row has fewer columns than a
                        requested position
    """
    hashed_lines = []
    for line_number, line in enumerate(lines):
        is_header = contains_column_headers and line_number == 0
        if is_header or not line.strip():
            # header row and blank lines carry no data to hash
            hashed_lines.append(line)
            continue
        line_tokens = line.split(",")
        # tokens are always read from the untouched list, so a position that
        # is listed twice is hashed once rather than hashing its own digest
        hashed_line_tokens = list(line_tokens)
        for position in column_positions_to_hash:
            index = position - 1
            try:
                token = line_tokens[index]
            except IndexError:
                raise IndexError(
                    "line {} has {} column(s), cannot hash column {}".format(
                        line_number + 1, len(line_tokens), position))
            # an empty field has no characters to inspect, so it is never
            # treated as quoted
            token_is_punctuated = bool(token) and is_enclosed_by(token)
            hashed_token = hash_word(token.strip("\""))
            if token_is_punctuated:
                hashed_token = "\"{}\"".format(hashed_token)
            hashed_line_tokens[index] = hashed_token
        hashed_lines.append(",".join(hashed_line_tokens))
    return hashed_lines
# /////////////////////////
# main method
# /////////////////////////
def main(base_dir="/Users/compadmin/Downloads",
         input_file_name="sample.csv",
         column_positions_to_hash=(3, 4),
         contains_column_headers=True):
    """
    Reads a CSV file, hashes the selected columns of every row and writes
    the result next to the input as "<input_file_name>.sha1".
    Called without arguments it processes "sample.csv" in the default
    directory, hashing columns 3 and 4 and keeping the header row as-is.
    ..
    :param base_dir: directory containing input / output files
                     (fully qualified absolute path)
    :type base_dir: str
    :param input_file_name: input file name (with extension)
    :type input_file_name: str
    :param column_positions_to_hash: (1-indexed) column positions to be
                                     hashed in each row
    :type column_positions_to_hash: Sequence[int]
    :param contains_column_headers: whether or not input file contains
                                    column headers
    :type contains_column_headers: bool
    :return: None
    """
    # output file name = "input_file_name.csv.sha1"
    output_file_name = input_file_name + ".sha1"

    lines = read_lines(file_path=os.path.join(base_dir, input_file_name))
    hashed_lines = hash_lines(lines=lines,
                              column_positions_to_hash=column_positions_to_hash,
                              contains_column_headers=contains_column_headers)
    write_lines(file_path=os.path.join(base_dir, output_file_name), lines=hashed_lines)
# run the hashing job only when this file is executed as a script,
# not when it is imported as a module
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment