Last active
January 18, 2017 03:54
-
-
Save alucard001/202c0b5a96154898d3b903a6cd04c967 to your computer and use it in GitHub Desktop.
Python Validate Email example script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8
import pandas as pd
import numpy as np
from validate_email import validate_email
import sys

# NOTE: the import below fails unless pyDNS or pyDNS3 is installed.
import DNS

# Resolvers to query, with rotation between them enabled.
DNS.defaults["server"] = ["8.8.8.8", "8.8.4.4"]
DNS.defaults["server_rotate"] = True
# The default protocol is UDP. TCP is used instead because, on a run of more
# than 300,000 addresses, UDP produced a lot of timeout errors even with a
# very low timeout value.
DNS.defaults["protocol"] = "tcp"
# Timeout, in seconds, for each check.
DNS.defaults["timeout"] = 2
# Command-line usage:
#
#   python2.7 clean_email.py <input csv> <file_name_saved_valid_email> <file_name_saved_invalid_email>
#
# NOTE(review): the original usage note described the input as a CSV "with
# only 1 email column", but the code reads the address from column 1 and
# copies column 2 through to the output, so at least three columns are
# needed. Confirm the expected layout against the real input files.

# Column labels (0-based, assigned by ``header=None``) read from each row.
EMAIL_COLUMN = 1
EXTRA_COLUMN = 2


def clean_emails(argv, validator=None):
    """Split the rows of a CSV into "valid" and "invalid" output files.

    The input is read without a header row, every value is converted to a
    string and duplicate rows are dropped. For each remaining row the
    lower-cased address is passed to the validator; the line
    ``<address>,<column 2>`` is then written to the valid file when the
    validator returns a truthy value and to the invalid file otherwise.
    A row whose validation raises is treated as invalid, so one failing
    lookup does not abort the run.

    Args:
        argv: Argument list shaped like ``sys.argv``:
            ``[program, input_csv, valid_output, invalid_output]``.
        validator: Callable invoked as
            ``validator(address, verify=True, smtp_timeout=2)``.
            Defaults to ``validate_email``.

    Raises:
        SystemExit: If fewer than three arguments follow the program name
            or the input path is empty.
    """
    if len(argv) < 4 or argv[1] == '':
        sys.exit("usage: clean_email.py <input csv> "
                 "<file_name_saved_valid_email> <file_name_saved_invalid_email>")
    path, ok_path, fail_path = argv[1:4]
    if validator is None:
        validator = validate_email

    rows = pd.read_csv(path, header=None, dtype=object)
    unique_rows = rows.astype(str).drop_duplicates()

    # ``with`` guarantees both outputs are flushed and closed, even on error.
    with open(ok_path, "w") as ok_file, open(fail_path, "w") as fail_file:
        for _, row in unique_rows.iterrows():
            # Build the output line before validating so that the failure
            # path always reports the current row, never a previous one.
            address = str(row[EMAIL_COLUMN]).lower()
            line = ",".join([address, row[EXTRA_COLUMN]]) + "\n"
            try:
                is_valid = validator(address, verify=True, smtp_timeout=2)
            except Exception:
                # Deliberate best effort: an address whose check raises is
                # recorded as a failed one.
                is_valid = False
            if is_valid:
                ok_file.write(line)
            else:
                fail_file.write(line)


if __name__ == "__main__":
    clean_emails(sys.argv)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Merge the per-chunk result files written by the validation script into a
# single "valid.csv" and a single "invalid.csv".
import pandas as pd
import glob


def merge_chunk_results(data_dir="data", output_dir="."):
    """Concatenate every chunk result file into one CSV per result type.

    For each of "valid" and "invalid", all files matching
    ``<data_dir>/email_chunk_*_<type>.csv`` are appended line by line to
    ``<output_dir>/<type>.csv``. The output file is created (or truncated)
    even when no chunk file matches.

    Args:
        data_dir: Directory holding the chunk result files.
        output_dir: Directory that receives the merged files.
    """
    for filetype in ["valid", "invalid"]:
        pattern = data_dir + "/email_chunk_*_" + filetype + ".csv"
        # ``with`` closes (and therefore flushes) every handle.
        with open(output_dir + "/" + filetype + ".csv", "w") as output:
            # glob returns paths in arbitrary order; sorting (plain string
            # order, so chunk 10 precedes chunk 2) makes the merged file
            # reproducible from run to run.
            for chunk_path in sorted(glob.glob(pattern)):
                with open(chunk_path, "r") as chunk:
                    output.writelines(chunk)


if __name__ == "__main__":
    merge_chunk_results()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Split large email list into small chunk, | |
deduplicate it and save it into separated file, | |
then run a validation script on all those files. | |
Yes, like doing Hadoop process manually in single machine. | |
""" | |
import pandas as pd | |
import sys | |
from subprocess import Popen | |
path = sys.argv[1] | |
chunksize = int(sys.argv[2]) | |
email = pd.read_csv(path, header=None, dtype=object, chunksize=chunksize) | |
unique_email = [] | |
for chunk in email: | |
unique_email.append(chunk.astype(str).drop_duplicates()) | |
chunkEmailLength = len(unique_email) | |
for i in range(chunkEmailLength): | |
filename = "data/email_chunk_" + str(i) + ".csv" | |
unique_email[i].to_csv(path_or_buf = filename, encoding="utf-8", header=False, index=False) | |
valid_file = "data/email_chunk_" + str(i) + "_valid.csv" | |
invalid_file = "data/email_chunk_" + str(i) + "_invalid.csv" | |
Popen(["python2.7", "Clean_Email.py", filename, valid_file, invalid_file]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment