Created
December 1, 2019 22:22
-
-
Save cordon-thiago/0ac8e629cdec9692fd9f266e2a72ad9d to your computer and use it in GitHub Desktop.
Extract and transform e-mail domain.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from functions import aux_functions

# Derive the e-mail domain of every address with the project helper.
# NOTE(review): presumably returns the text after "@" -- confirm in
# aux_functions.getEmailDomain.
hardbounce_2['emailDomain'] = hardbounce_2['email'].apply(aux_functions.getEmailDomain)

# Count non-null e-mails per domain, most frequent first.
# Selecting 'email' before counting avoids aggregating every other column.
group_df = (
    hardbounce_2.groupby('emailDomain')['email']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

# Plot the 20 most frequent domains.
sns.set(style="whitegrid")
ax = sns.barplot(x="email", y="emailDomain", data=group_df.head(20))

# Keep gmail.com and hotmail.com as their own categories and collapse every
# other domain (missing values included) into 'others'.
hardbounce_2["emailDomain_cat"] = hardbounce_2["emailDomain"].where(
    hardbounce_2["emailDomain"].isin(['gmail.com', 'hotmail.com']),
    'others',
)

# Count non-null e-mails per domain category, most frequent first.
group_df = (
    hardbounce_2.groupby('emailDomain_cat')['email']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

# Plot the domain categories.
# NOTE(review): when this runs in the same cell/script as the plot above,
# seaborn draws on the current axes -- create a new figure first if the two
# charts should be separate.
sns.set(style="whitegrid")
ax = sns.barplot(x="email", y="emailDomain_cat", data=group_df)

# Distribution of the target variable (flgHardBounce_n) by domain category.
# NOTE(review): the positional True / "index" arguments look like crosstab-style
# margins / normalize options -- confirm against aux_functions.freqTable.
aux_functions.freqTable(
    [hardbounce_2["emailDomain_cat"]],
    [hardbounce_2["flgHardBounce_n"]],
    True,
    "index",
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from functions import aux_functions

# Split off the first piece of the e-mail domain with the project helper.
# NOTE(review): the later comparison against 'com' suggests this is the first
# label after the provider name (e.g. "com" in "gmail.com.br") -- confirm in
# aux_functions.getPiece1EmailDomain.
hardbounce_2['emailDomainPiece1'] = hardbounce_2['emailDomain'].apply(aux_functions.getPiece1EmailDomain)

# Count non-null e-mails per domain piece, most frequent first.
# Selecting 'email' before counting avoids aggregating every other column.
group_df = (
    hardbounce_2.groupby('emailDomainPiece1')['email']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

# Plot the 20 most frequent domain pieces.
sns.set(style="whitegrid")
ax = sns.barplot(x="email", y="emailDomainPiece1", data=group_df.head(20))

# Keep 'com' as its own category and collapse every other value (missing
# values included) into 'others'; per the original author, 'com' accounts for
# the majority of the dataset rows.
# Note: this overwrites the raw emailDomainPiece1 column in place.
hardbounce_2["emailDomainPiece1"] = hardbounce_2["emailDomainPiece1"].where(
    hardbounce_2["emailDomainPiece1"].isin(['com']),
    'others',
)

# Distribution of the target variable (flgHardBounce_n) by domain piece.
# NOTE(review): the positional True / "index" arguments look like crosstab-style
# margins / normalize options -- confirm against aux_functions.freqTable.
aux_functions.freqTable(
    [hardbounce_2["emailDomainPiece1"]],
    [hardbounce_2["flgHardBounce_n"]],
    True,
    "index",
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
from functions import aux_functions

# Split off the second piece of the e-mail domain with the project helper.
# NOTE(review): the later comparison against 'missing' suggests the helper
# returns the string 'missing' when there is no second piece -- confirm in
# aux_functions.getPiece2EmailDomain.
hardbounce_2['emailDomainPiece2'] = hardbounce_2['emailDomain'].apply(aux_functions.getPiece2EmailDomain)

# Count non-null e-mails per domain piece, most frequent first.
# Selecting 'email' before counting avoids aggregating every other column.
group_df = (
    hardbounce_2.groupby('emailDomainPiece2')['email']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

# Plot the 20 most frequent domain pieces.
sns.set(style="whitegrid")
ax = sns.barplot(x="email", y="emailDomainPiece2", data=group_df.head(20))

# Keep 'missing' and 'br' as their own categories and collapse every other
# value into 'others'; per the original author, those two represent the
# majority of the dataset rows.
# Note: this overwrites the raw emailDomainPiece2 column in place.
hardbounce_2["emailDomainPiece2"] = hardbounce_2["emailDomainPiece2"].where(
    hardbounce_2["emailDomainPiece2"].isin(['missing', 'br']),
    'others',
)

# Distribution of the target variable (flgHardBounce_n) by domain piece.
# NOTE(review): the positional True / "index" arguments look like crosstab-style
# margins / normalize options -- confirm against aux_functions.freqTable.
aux_functions.freqTable(
    [hardbounce_2["emailDomainPiece2"]],
    [hardbounce_2["flgHardBounce_n"]],
    True,
    "index",
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment