# Source: GitHub Gist by @wmhtet (gist id 3f827f04cf2324fc7d2c3199eb7ad635).
# Last active: April 23, 2019 07:30.
import sys
import os
import re
import datetime
def get_email_list(name, domain_list, total_left):
    """Move every email ending with one of *domain_list* out of *total_left*.

    Args:
        name: Label of the group being collected; echoed in the progress
            line and stored in the returned tuple.
        domain_list: Iterable of suffix strings; an email matches when it
            ends with any of them (``str.endswith``).
        total_left: List of emails not yet assigned to a group. It is
            modified in place: matching emails are removed and the relative
            order of the remaining ones is kept.

    Returns:
        A pair ``(language, total_left)`` where ``language`` is the tuple
        ``(match_count, name, matched_emails)`` and ``total_left`` is the
        same list object that was passed in.
    """
    # str.endswith accepts a tuple, so one call checks every suffix.
    suffixes = tuple(domain_list)

    email_list = []
    remaining = []
    # Single pass instead of list.remove() per match (which is quadratic).
    for email in total_left:
        if email.endswith(suffixes):
            email_list.append(email)
        else:
            remaining.append(email)

    # Slice-assign so callers holding a reference to the list see the update.
    total_left[:] = remaining

    language = (len(email_list), name, email_list)
    print("Total left {} : {} {} ".format(len(total_left), language[1], language[0]))
    return language, total_left
# Suffixes that mark an address as belonging to the English group.
_ENGLISH_DOMAINS = [
    "@gmail.com", "@hotmail.com", "@yahoo.com", ".co.in", ".ie", ".com.sg",
    ".co.uk", ".com.au", "@icloud.com", "@outlook.com", "@protonmail.com",
    "@aol.com", "@ymail.com", "@live.com", "@mac.com", "@msn.com",
    "@yahoo.ca", "@me.com", "@googlemail.com", "@yahoo.com.sg", "@yahoo.ie",
    "@btinternet.com", "@eircom.net", "@comcast.net",
]

# (group name, suffixes) pairs, applied in this order after the English pass.
# Order matters: an email is claimed by the first group whose suffix matches.
_OTHER_LANGUAGE_DOMAINS = [
    ("Chinese", [".hk", ".cn", ".tw", "@qq.com", "@163.com", "@126.com",
                 "@139.com", "@foxmail.com", "@yeah.net", "@aliyun.com",
                 "sina.com", "msa.hinet.net", "halchina.com",
                 "internchina.com"]),
    ("Korean", [".kr", "@naver.com", "@hanmail.net", "@nate.com",
                "@daum.net", "@korea.com", "@posco.com"]),
    ("Japanese", [".jp", "@ab.wakwak.com", "@nifty.com"]),
    ("German", [".de", ".at", ".ch", "@gmx.net"]),
    ("France", [".fr", ".nc", "@laposte.net", "@kedgebs.com"]),
    ("Italian", [".it"]),
    ("Russian", [".ru", "@yandex.com", "@ukr.net"]),
    ("Portugese", [".br", ".pt"]),
    ("Polish", [".pl"]),
    ("Swedish", [".se"]),
    ("Czech", [".cz"]),
    ("Croatia(Not available) ", [".hr"]),
    ("Hebrew", [".il"]),
    ("Indonesia", [".id", "@mcreasindo.com"]),
    ("Dutch", [".nl", "@chocoweb.com", "@vierbergen.net"]),
    ("Spanish", [".cl", ".ar", ".es", "@mapp-oea.org"]),
]

_EDUCATION_DOMAINS = [".edu", ".edu.sg", ".edu.au", "@semesteratsea.org",
                      ".edu.my"]
_VIETNAMESE_DOMAINS = [".vn"]

# Substrings that move an English-group address to the Vietnamese group.
_VIETNAMESE_NAME_HINTS = ("nguyen", "hoang", "ngoc", "phuong")

# The English list is written in sections of this many addresses, with at
# most this many sections (the last one takes everything that remains).
_ENGLISH_CHUNK_SIZE = 1000
_ENGLISH_MAX_SECTIONS = 4

# Address ends with a dot followed by exactly one character.
_ONE_CHAR_SUFFIX_RE = re.compile(r".*\..$")
# Address contains a dot, any single character, and another dot.
_ONE_CHAR_LABEL_RE = re.compile(r".*\..\..*")


def _load_no_reviewer_emails(path):
    """Return the lower-cased lines of *path* as a set (empty if not a file)."""
    if not os.path.isfile(path):
        return set()
    with open(path) as handle:
        return {line.lower().strip('\n') for line in handle}


def _clean_line(line):
    """Lower-case *line* and strip it down to the bare address text."""
    line = line.lower().strip('\n')
    # Drop everything up to and including the last "(" on the line;
    # presumably input lines look like "Name (email)" - verify against data.
    line = re.sub(r'(?is).*\(', "", line)
    # NOTE(review): the two "â" literals below are encoding residue copied
    # from the original script; confirm which invisible character each one
    # was meant to contain.
    return (line.replace(")", "")
                .replace(" ", "")
                .replace("â ", "")
                .replace("â ", ""))


def _is_bad_email(email):
    """Return True when a leftover address matches a bad-address rule."""
    return bool(
        "@gmail" in email
        or "@aol" in email
        or "@" not in email
        or _ONE_CHAR_SUFFIX_RE.search(email)
        or _ONE_CHAR_LABEL_RE.search(email)
    )


def _english_sections(emails):
    """Split *emails* into ``(title, chunk)`` pairs without losing entries."""
    if len(emails) < _ENGLISH_CHUNK_SIZE:
        return [("English", emails)]

    section_count = min(len(emails) // _ENGLISH_CHUNK_SIZE + 1,
                        _ENGLISH_MAX_SECTIONS)
    sections = []
    for index in range(section_count):
        start = index * _ENGLISH_CHUNK_SIZE
        # Titles keep the original labelling: "0-1000", "1001-2000", ...
        first_label = start + 1 if index else 0
        if index == section_count - 1:
            # The last section is open-ended and takes all remaining emails.
            sections.append(("English {}-".format(first_label),
                             emails[start:]))
        else:
            end = start + _ENGLISH_CHUNK_SIZE
            # Slices are contiguous ([0:1000], [1000:2000], ...) so the
            # entries at indices 1000, 2000 and 3000 are no longer dropped.
            sections.append(("English {}-{}".format(first_label, end),
                             emails[start:end]))
    return sections


def main():
    """Group the addresses in a file by domain and write a dated report.

    Command line:
        ``sys.argv[1]``: path of the file holding one address entry per line.
        ``sys.argv[2]`` (optional): path of a file listing addresses to
        exclude; it is ignored when it is not an existing file.

    The report is written to ``tripadvisor_<YYYY_MM_DD>.txt`` in the current
    working directory. Progress counts are printed to standard output.
    Exits with status 1 when the input path is missing or not a file.
    """
    if len(sys.argv) < 2:
        print("Usage: python <script> <email_file> [no_reviewer_file]")
        sys.exit(1)

    file_path = sys.argv[1]
    if not os.path.isfile(file_path):
        print("File path {} does not exist. Program Exiting..".format(
            file_path))
        sys.exit(1)

    no_reviewer_path = sys.argv[2] if len(sys.argv) > 2 else ""
    no_reviewer_emails = _load_no_reviewer_emails(no_reviewer_path)

    with open(file_path) as fp:
        lines = [_clean_line(line) for line in fp]
    print("Total {} ".format(len(lines)))

    total_left = sorted(set(lines))
    print("Total left {}. Duplicates removed.".format(len(total_left)))

    total_left = [x for x in total_left if x not in no_reviewer_emails]
    print("Total left {}. No Reviewers removed.".format(len(total_left)))

    english, total_left = get_email_list("English", _ENGLISH_DOMAINS,
                                         total_left)

    languages = []
    for name, domains in _OTHER_LANGUAGE_DOMAINS:
        language, total_left = get_email_list(name, domains, total_left)
        languages.append(language)

    # Education addresses get their own section at the end of the report
    # and are not ranked with the languages.
    education, total_left = get_email_list("Education", _EDUCATION_DOMAINS,
                                           total_left)
    vietnamese, total_left = get_email_list("Vietnamese", _VIETNAMESE_DOMAINS,
                                            total_left)

    # Move English-group addresses containing a Vietnamese name hint.
    english_emails = []
    vietnamese_emails = list(vietnamese[2])
    for email in english[2]:
        if any(hint in email for hint in _VIETNAMESE_NAME_HINTS):
            vietnamese_emails.append(email)
        else:
            english_emails.append(email)
    english = (len(english_emails), english[1], english_emails)
    vietnamese = (len(vietnamese_emails), vietnamese[1], vietnamese_emails)
    print("Total left {} : {} {} : {} {} "
          .format(len(total_left), english[1], english[0], vietnamese[1],
                  vietnamese[0]))
    languages.append(vietnamese)

    # Split what is still unclassified into bad addresses and leftovers.
    bad_email = []
    leftover = []
    for email in total_left:
        if _is_bad_email(email):
            bad_email.append(email)
        else:
            leftover.append(email)
    total_left = leftover
    print("Total left {} : bad {} ".format(len(total_left), len(bad_email)))
    print("========================")

    # Largest group first; sorted() is stable, so ties keep the table order.
    languages = sorted(languages, key=lambda tup: tup[0], reverse=True)

    out_file_name = "tripadvisor_{}.txt" \
        .format(datetime.datetime.today().strftime('%Y_%m_%d'))
    with open(out_file_name, "w") as of:
        for cnt, language in enumerate(languages, start=1):
            of.write("\n\n==== {}. {} ====\n".format(cnt, language[1]))
            of.write("\n".join(language[2]))

        for title, chunk in _english_sections(english[2]):
            of.write("\n\n==== {} ====\n".format(title))
            of.write("\n".join(chunk))
        if len(english[2]) >= _ENGLISH_CHUNK_SIZE * (_ENGLISH_MAX_SECTIONS - 1):
            print("Contact IT. There are over 3000 emails!!!")

        of.write("\n\n==== Leftover ====\n")
        of.write("\n".join(total_left))
        of.write("\n\n==== Bad email ====\n")
        of.write("\n".join(bad_email))
        of.write("\n\n")
        of.write(" ".join(["@gmail.com", "@hotmail.com", "@yahoo.com", ".co.in",
                           ".ie", ".com.sg", ".co.uk", ".com.au", ".de", ".fr",
                           ".hk", "co.jp", "@outlook.com", "@aol.com"]))
        of.write("\n")
        of.write("\n\n==== Education ====\n")
        of.write("\n".join(education[2]))
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
# domain_list = [".hk", ".cn", ".tw", "@qq.com", "@163.com", "@126.com",
# "@139.com", "@foxmail.com", "@yeah.net", "@aliyun.com"]
# (GitHub page footer: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")