@alucard001
Last active January 18, 2017 03:54
Python validate_email example scripts: validate email addresses from a CSV, merge the per-chunk results, and split a large list into chunks for parallel validation.
# coding: utf-8
import pandas as pd
import numpy as np
from validate_email import validate_email
import sys
# Note: you may encounter errors if you don't have pyDNS (or its Python 3 port, py3dns) installed.
import DNS
DNS.defaults["server"] = ["8.8.8.8", "8.8.4.4"]
DNS.defaults["server_rotate"] = True
# pyDNS defaults to UDP. TCP is used here because, with over 300,000 emails to check,
# UDP produced a lot of timeout errors even with a very low timeout value.
DNS.defaults["protocol"] = "tcp"
# Timeout (in seconds) for each DNS lookup
DNS.defaults["timeout"] = 2
# This is meant to be run as a command-line program.
# The invocation looks like:
#
# python2.7 clean_email.py <input csv containing the email addresses> <output file for valid emails> <output file for invalid emails>
#
if len(sys.argv) > 3 and sys.argv[1] != '':
    path = sys.argv[1]
else:
    # path = "data/test_count_email.csv"  # kept from the original, where it was immediately followed by sys.exit()
    sys.exit("Usage: python2.7 clean_email.py <input csv> <valid output file> <invalid output file>")
email = pd.read_csv(path, header=None, dtype=object)
unique_email = email.astype(str).drop_duplicates()
# Check each email address and write it to the valid or invalid output file.
# valid_email and df_email below are leftovers from the commented-out
# DataFrame-based approach near the end of this script.
valid_email = []
# vaild_email_address = []
df_email = ''
ok_file = open(sys.argv[2], "w")
fail_file = open(sys.argv[3], "w")
for i, row in unique_email.iterrows():
    try:
        # header=None, so columns are integer-indexed: column 1 holds the email
        # address and column 2 an extra field that is copied to the output file
        email = str(row[1]).lower()
        is_valid_email = validate_email(email, verify=True, smtp_timeout=2)
        if is_valid_email:
            ok_file.write(",".join([email, row[2]]) + "\n")
        else:
            fail_file.write(",".join([email, row[2]]) + "\n")
    except Exception as e:
        # print(str(e))
        # print("Current email address:" + email)
        # An email that raises an exception is written out as a failed one
        fail_file.write(",".join([email, row[2]]) + "\n")
# unique_email["valid"] = valid_email
# all_valid_address = unique_email[unique_email["valid"]==True]
# Write CSV file
# all_valid_address.to_csv("valid_email.csv", header=False, encoding="utf-8", index=False)
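# Not in the original gist: close the output files explicitly so buffered
# writes are flushed as soon as the loop finishes.
ok_file.close()
fail_file.close()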
# This script reads all the CSV files generated by the script above
import pandas as pd
import glob
# Collect the contents of every valid/invalid chunk file,
# merge them into one, and
# create a single CSV file for each result type
for filetype in ["valid", "invalid"]:
    output = open(filetype + ".csv", "w")
    for file in glob.glob("data/email_chunk_*_" + filetype + ".csv"):
        for row in open(file, "r"):
            output.write(row)
    output.close()
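# Alternative sketch (not in the original gist): the same merge done with pandas,
# which is already imported above; kept commented out so it does not duplicate
# the loop version. The file pattern and output names are the same assumptions.
#
# for filetype in ["valid", "invalid"]:
#     parts = [pd.read_csv(f, header=None, dtype=object)
#              for f in glob.glob("data/email_chunk_*_" + filetype + ".csv")]
#     if parts:
#         pd.concat(parts).to_csv(filetype + ".csv", header=False, index=False)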
"""
Split large email list into small chunk,
deduplicate it and save it into separated file,
then run a validation script on all those files.
Yes, like doing Hadoop process manually in single machine.
"""
import pandas as pd
import sys
from subprocess import Popen
path = sys.argv[1]
chunksize = int(sys.argv[2])
email = pd.read_csv(path, header=None, dtype=object, chunksize=chunksize)
unique_email = []
for chunk in email:
    unique_email.append(chunk.astype(str).drop_duplicates())
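# Note: drop_duplicates() above works per chunk, so a duplicate address that
# falls into two different chunks is not removed; it is simply validated twice.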
chunkEmailLength = len(unique_email)
for i in range(chunkEmailLength):
    filename = "data/email_chunk_" + str(i) + ".csv"
    unique_email[i].to_csv(path_or_buf=filename, encoding="utf-8", header=False, index=False)
    valid_file = "data/email_chunk_" + str(i) + "_valid.csv"
    invalid_file = "data/email_chunk_" + str(i) + "_invalid.csv"
    # Launch one validator process per chunk; the chunks are checked in parallel
    Popen(["python2.7", "Clean_Email.py", filename, valid_file, invalid_file])
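# Example invocation (the file and script names here are assumptions, matching
# the Popen call above, with a 50,000-row chunk size):
#
#   python2.7 Split_Email.py data/all_emails.csv 50000
#
# Each chunk produces its own *_valid.csv / *_invalid.csv pair, which the merge
# script above combines into valid.csv and invalid.csv.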