Last active
April 23, 2019 07:30
-
-
Save wmhtet/3f827f04cf2324fc7d2c3199eb7ad635 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime
import os
import re
import sys
def get_email_list(name, domain_list, total_left):
    """Move every address ending in one of ``domain_list`` out of ``total_left``.

    Args:
        name: Label for this group; stored in the result and printed.
        domain_list: Iterable of suffix strings (e.g. ``"@gmail.com"``,
            ``".co.uk"``) tested with ``str.endswith``.
        total_left: List of address strings. Matching entries are removed
            from this list in place; the relative order of the remaining
            entries is preserved.

    Returns:
        A pair ``(language, total_left)`` where ``language`` is the tuple
        ``(match_count, name, matched_addresses)`` and ``total_left`` is the
        same list object that was passed in, now without the matches.
    """
    # str.endswith accepts a tuple, so one call covers every suffix.
    suffixes = tuple(domain_list)

    email_list = []
    remaining = []
    for email in total_left:
        if email.endswith(suffixes):
            email_list.append(email)
        else:
            remaining.append(email)

    # Slice-assign so callers holding a reference to the original list still
    # observe the removal, without a quadratic list.remove() per match.
    total_left[:] = remaining

    language = (len(email_list), name, email_list)
    print("Total left {} : {} {} ".format(len(total_left), language[1], language[0]))
    return language, total_left
# Address suffixes filed under "English".
_ENGLISH_DOMAINS = (
    "@gmail.com", "@hotmail.com", "@yahoo.com", ".co.in",
    ".ie", ".com.sg", ".co.uk", ".com.au", "@icloud.com",
    "@outlook.com", "@protonmail.com", "@aol.com",
    "@ymail.com", "@live.com", "@mac.com", "@msn.com",
    "@yahoo.ca", "@me.com", "@googlemail.com",
    "@yahoo.com.sg", "@yahoo.ie",
    "@btinternet.com", "@eircom.net", "@comcast.net",
)

# (label, suffixes) pairs, classified in this order after English. Order
# matters: each pass removes its matches from the pool seen by later passes.
# Labels are written verbatim into the report headers.
_LANGUAGE_DOMAINS = (
    ("Chinese", (".hk", ".cn", ".tw", "@qq.com", "@163.com", "@126.com",
                 "@139.com", "@foxmail.com", "@yeah.net", "@aliyun.com",
                 "sina.com", "msa.hinet.net", "halchina.com",
                 "internchina.com")),
    ("Korean", (".kr", "@naver.com", "@hanmail.net", "@nate.com",
                "@daum.net", "@korea.com", "@posco.com")),
    ("Japanese", (".jp", "@ab.wakwak.com", "@nifty.com")),
    ("German", (".de", ".at", ".ch", "@gmx.net")),
    ("France", (".fr", ".nc", "@laposte.net", "@kedgebs.com")),
    ("Italian", (".it",)),
    ("Russian", (".ru", "@yandex.com", "@ukr.net")),
    ("Portugese", (".br", ".pt")),
    ("Polish", (".pl",)),
    ("Swedish", (".se",)),
    ("Czech", (".cz",)),
    ("Croatia(Not available) ", (".hr",)),
    ("Hebrew", (".il",)),
    ("Indonesia", (".id", "@mcreasindo.com")),
    ("Dutch", (".nl", "@chocoweb.com", "@vierbergen.net")),
    ("Spanish", (".cl", ".ar", ".es", "@mapp-oea.org")),
)

_EDUCATION_DOMAINS = (".edu", ".edu.sg", ".edu.au", "@semesteratsea.org",
                      ".edu.my")
_VIETNAMESE_DOMAINS = (".vn",)

# Substrings that re-file an "English" address under Vietnamese.
_VIETNAMESE_NAMES = ("nguyen", "hoang", "ngoc", "phuong")

# Suffix reminder line written near the end of the report.
_FOOTER_DOMAINS = ("@gmail.com", "@hotmail.com", "@yahoo.com", ".co.in",
                   ".ie", ".com.sg", ".co.uk", ".com.au", ".de", ".fr",
                   ".hk", "co.jp", "@outlook.com", "@aol.com")

# Everything up to and including the last "(" on a line.
_UP_TO_LAST_PAREN_RE = re.compile(r'(?is).*\(')
# A "." followed by exactly one character at the end of the string.
_DOT_ONE_CHAR_END_RE = re.compile(r".*\..$")
# A "." followed by any single character and another ".".
_DOT_CHAR_DOT_RE = re.compile(r".*\..\..*")


def _read_no_reviewer_emails(path):
    """Return the lower-cased lines of ``path`` as a set (empty if no such file)."""
    if not os.path.isfile(path):
        return set()
    with open(path) as fp2:
        return {line.lower().strip('\n') for line in fp2}


def _read_email_lines(path):
    """Return one cleaned string per line of ``path``, in file order.

    Each line is lower-cased, stripped of newlines, cut down to whatever
    follows its last "(", and has ")" and spaces removed.
    """
    lines = []
    with open(path) as fp:
        for line in fp:
            line = line.lower().strip('\n')
            line = _UP_TO_LAST_PAREN_RE.sub("", line)
            # NOTE(review): the two "â " literals below are copied as they
            # render in the source. They look like mojibake left over from
            # hidden (possibly bidirectional) Unicode characters, and as
            # rendered they can never match because spaces are removed first.
            # Confirm the intended characters against the original file.
            line = line.replace(")", "").replace(" ", "").replace("â ", "") \
                .replace("â ", "")
            lines.append(line)
    return lines


def _partition(items, predicate):
    """Split ``items`` into ``(rejected, accepted)`` lists, preserving order."""
    rejected, accepted = [], []
    for item in items:
        (accepted if predicate(item) else rejected).append(item)
    return rejected, accepted


def _has_vietnamese_name(email):
    """Return True if ``email`` contains any of the Vietnamese name substrings."""
    return any(substring in email for substring in _VIETNAMESE_NAMES)


def _is_bad_email(email):
    """Return True for leftovers the script files under "Bad email"."""
    return bool(
        "@gmail" in email or "@aol" in email or "@" not in email
        or _DOT_ONE_CHAR_END_RE.search(email)
        or _DOT_CHAR_DOT_RE.search(email)
    )


def _english_sections(emails, chunk_size=1000, max_sections=4):
    """Return ``(header_label, emails_slice)`` pairs for the English section.

    Fewer than ``chunk_size`` addresses yield a single "English" section.
    Otherwise the list is cut into consecutive ``chunk_size`` slices, at most
    ``max_sections`` of them, with the last slice open-ended. The slices are
    contiguous, so no address is skipped at a chunk boundary.
    """
    if len(emails) < chunk_size:
        return [("English", emails)]

    section_count = min(len(emails) // chunk_size + 1, max_sections)
    sections = []
    for index in range(section_count):
        start = index * chunk_size
        if index == section_count - 1:
            sections.append(("English {}-".format(start + 1), emails[start:]))
        else:
            end = start + chunk_size
            first = start + 1 if index else 0
            sections.append(
                ("English {}-{}".format(first, end), emails[start:end]))
    return sections


def main():
    """Group the addresses in a text file by domain and write a dated report.

    Command line:
        argv[1]: path to the input file, one entry per line.
        argv[2]: optional path to a file of addresses to exclude.

    The cleaned lines are de-duplicated, sorted, filtered against the
    exclusion file, then passed through ``get_email_list`` once per group.
    Results go to ``tripadvisor_<YYYY_MM_DD>.txt`` in the current directory;
    progress counts are printed along the way.
    """
    if len(sys.argv) < 2:
        # Previously this surfaced as an IndexError traceback.
        sys.exit("Usage: {} EMAIL_FILE [NO_REVIEWER_FILE]".format(sys.argv[0]))

    file_path = sys.argv[1]
    if not os.path.isfile(file_path):
        print("File path {} Program Exiting..".format(
            file_path))
        sys.exit()

    no_reviewer_emails = set()
    if len(sys.argv) > 2:
        no_reviewer_emails = _read_no_reviewer_emails(sys.argv[2])

    lines = _read_email_lines(file_path)
    total_left = list(dict.fromkeys(lines))
    print("Total {} ".format(len(lines)))
    print("Total left {}. Duplicates removed.".format(len(total_left)))

    total_left = sorted(total_left)
    total_left = [x for x in total_left if x not in no_reviewer_emails]
    print("Total left {}. No Reviewers removed.".format(len(total_left)))

    english, total_left = get_email_list("English", _ENGLISH_DOMAINS,
                                         total_left)

    languages = []
    for label, domains in _LANGUAGE_DOMAINS:
        language, total_left = get_email_list(label, domains, total_left)
        languages.append(language)

    # Education is deliberately kept out of `languages`; it is written as its
    # own section at the very end of the report.
    education, total_left = get_email_list("Education", _EDUCATION_DOMAINS,
                                           total_left)
    vietnamese, total_left = get_email_list("Vietnamese", _VIETNAMESE_DOMAINS,
                                            total_left)

    # Re-file "English" addresses that contain a Vietnamese name substring.
    english_emails, renamed = _partition(english[2], _has_vietnamese_name)
    vietnamese_emails = vietnamese[2] + renamed
    english = (len(english_emails), english[1], english_emails)
    vietnamese = (len(vietnamese_emails), vietnamese[1], vietnamese_emails)
    print("Total left {} : {} {} : {} {} "
          .format(len(total_left), english[1], english[0], vietnamese[1],
                  vietnamese[0]))
    languages.append(vietnamese)

    total_left, bad_email = _partition(total_left, _is_bad_email)
    print("Total left {} : bad {} ".format(len(total_left), len(bad_email)))
    print("========================")

    # Largest group first; English is written separately after the others.
    languages = sorted(languages, key=lambda tup: tup[0], reverse=True)
    out_file_name = "tripadvisor_{}.txt" \
        .format(datetime.datetime.today().strftime('%Y_%m_%d'))
    with open(out_file_name, "w") as of:
        for cnt, language in enumerate(languages, start=1):
            of.write("\n\n==== {}. {} ====\n".format(cnt, language[1]))
            of.write("\n".join(language[2]))

        for header, chunk in _english_sections(english[2]):
            of.write("\n\n==== {} ====\n".format(header))
            of.write("\n".join(chunk))
        if len(english[2]) >= 3000:
            print("Contact IT. There are over 3000 emails!!!")

        of.write("\n\n==== Leftover ====\n")
        of.write("\n".join(total_left))
        of.write("\n\n==== Bad email ====\n")
        of.write("\n".join(bad_email))
        of.write("\n\n")
        of.write(" ".join(_FOOTER_DOMAINS))
        of.write("\n")
        of.write("\n\n==== Education ====\n")
        of.write("\n".join(education[2]))
# Run the report only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
# domain_list = [".hk", ".cn", ".tw", "@qq.com", "@163.com", "@126.com",
#                "@139.com", "@foxmail.com", "@yeah.net", "@aliyun.com"]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment