Created July 5, 2021 20:33
Python code to search for emails in csv, xlsx and json files; split each email into name and domain; apply an md5 hash to the name as-is, in uppercase and in lowercase; and return the results in csv format.
# Python code to search for emails in csv, xlsx and json files;
# split each email into name and domain;
# and apply an md5 hash to the name, the name in uppercase and the name in lowercase;
# returning them in csv format.
import hashlib
import re
import csv
import glob

import pandas as pd


def csv_from_excel(filename):
    # Read the content of an Excel file and write it out as a
    # semicolon-separated csv file next to the original.
    read_file = pd.read_excel(filename, engine='openpyxl')
    read_file.to_csv(filename + ".csv", index=False, sep=";")


# Simple email pattern: local part, "@", domain, dot, TLD of two or more letters.
EMAIL = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'


def md5(text):
    return hashlib.md5(text.encode()).hexdigest()


def process(csv_lines, text):
    # Find all emails in the text and, for each match, append three rows:
    # the name as-is, in lowercase and in uppercase, each with its md5 hash.
    match = re.findall(EMAIL, text)
    if match:
        for m in match:
            m_split = m.split("@")
            if len(m_split) >= 2:
                m_name = m_split[0]
                csv_lines.append([m_name, md5(m_name), m])
                csv_lines.append([m_name.lower(), md5(m_name.lower()), m])
                csv_lines.append([m_name.upper(), md5(m_name.upper()), m])


csv_lines = []

# Convert every Excel file to csv so it is picked up by the glob below.
for filename in glob.glob('./data/*.xlsx'):
    csv_from_excel(filename)

for filename in glob.glob('./data/*.json') + glob.glob('./data/*.csv'):
    print(filename)
    try:
        with open(filename, encoding='utf-8', mode='r') as file:
            text = file.read()
        process(csv_lines, text)
    except UnicodeDecodeError:
        # Fall back to the platform default encoding.
        with open(filename, mode='r') as file:
            text = file.read()
        process(csv_lines, text)


# Remove dupes
def removeDupes(csv_lines):
    # Keep one row per unique name, then return the rows sorted by name.
    csv_lines = sorted(csv_lines, key=lambda x: x[0])
    csv_dict = {}
    for line in csv_lines:
        csv_dict[line[0]] = line[1:]
    csv_lines_ = []
    for l0, l1 in csv_dict.items():
        csv_lines_.append([l0] + l1)
    return sorted(csv_lines_, key=lambda x: x[0])


csv_lines = removeDupes(csv_lines)
print(csv_lines[:5])
print(len(csv_lines))

# Open the output file in write mode and dump every row as a
# semicolon-separated line.
with open('output.csv', 'w', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter=';', lineterminator='\n')
    for line in csv_lines:
        writer.writerow(line)
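
As a quick sanity check, the snippet below is a minimal sketch (the email address is hypothetical) that prints the three rows the script generates for a single match, in the same name;md5;email layout written to output.csv:

import hashlib

email = "Jane.Doe@example.com"  # hypothetical example address
name = email.split("@")[0]

# One row per case variant, matching the layout produced by process().
for variant in (name, name.lower(), name.upper()):
    digest = hashlib.md5(variant.encode()).hexdigest()
    print(";".join([variant, digest, email]))

Since removeDupes keys on the name column, the three case variants of the same email all survive deduplication as separate rows.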