Created
June 2, 2019 08:31
-
-
Save karamanbk/524e438c97f8df1fe46aa32c79611ab4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Recency: days between each customer's latest purchase in tx_6m and the
# latest purchase made by anyone in tx_6m.
tx_max_purchase = (
    tx_6m.groupby('CustomerID')['InvoiceDate']
    .max()
    .reset_index()
)
tx_max_purchase.columns = ['CustomerID', 'MaxPurchaseDate']

# The newest purchase date in the data is the reference point, so the most
# recently active customer gets Recency == 0.
latest_purchase = tx_max_purchase['MaxPurchaseDate'].max()
tx_max_purchase['Recency'] = (
    latest_purchase - tx_max_purchase['MaxPurchaseDate']
).dt.days

# Attach Recency to the per-customer frame (default inner join on CustomerID).
tx_user = tx_user.merge(
    tx_max_purchase[['CustomerID', 'Recency']],
    on='CustomerID',
)
# Histogram of Recency across all customers in tx_user.
plot_data = [go.Histogram(x=tx_user['Recency'])]
plot_layout = go.Layout(title='Recency')
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Cluster customers into 4 groups using Recency as the only feature.
# random_state pins the centroid initialisation so the cluster assignment (and
# every score derived from it) is identical on each run. n_init is spelled out
# because scikit-learn changed its default from 10 to 'auto' in version 1.4.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
recency_values = tx_user[['Recency']]  # double brackets: 2-D, one feature column
kmeans.fit(recency_values)
# The labels produced here are arbitrary integers; they carry no ordering yet.
tx_user['RecencyCluster'] = kmeans.predict(recency_values)
# order cluster method
def order_cluster(cluster_field_name, target_field_name, df, ascending):
    """Renumber cluster labels so they follow the order of the cluster means.

    The mean of ``target_field_name`` is computed per cluster, the clusters
    are sorted by that mean, and each cluster receives its position in the
    sorted order (0, 1, 2, ...) as its new label.

    Args:
        cluster_field_name: Name of the column holding the current labels.
        target_field_name: Name of the numeric column whose per-cluster mean
            decides the ordering.
        df: DataFrame containing both columns. It is not modified.
        ascending: If True the cluster with the smallest mean becomes 0;
            if False the cluster with the largest mean becomes 0.

    Returns:
        A new DataFrame with the same columns as ``df``, except that
        ``cluster_field_name`` now holds the ordered labels and is placed
        last. The result of the merge carries a fresh integer index.
    """
    # One row per cluster, sorted by the mean of the target column; after the
    # second reset_index the row position is the cluster's new label.
    cluster_means = (
        df.groupby(cluster_field_name)[target_field_name]
        .mean()
        .reset_index()
        .sort_values(by=target_field_name, ascending=ascending)
        .reset_index(drop=True)
    )

    # Temporary column that carries the new label through the merge. A fixed
    # name such as 'index' would clash with a same-named column already in df
    # (the merge would emit index_x / index_y and the rename below would find
    # nothing), so prefix underscores until the name is unused.
    rank_column = 'index'
    while rank_column in df.columns:
        rank_column = '_' + rank_column
    cluster_means[rank_column] = cluster_means.index

    # Swap the old labels for the new ones via a join on the old label.
    df_final = pd.merge(
        df,
        cluster_means[[cluster_field_name, rank_column]],
        on=cluster_field_name,
    )
    df_final = df_final.drop([cluster_field_name], axis=1)
    return df_final.rename(columns={rank_column: cluster_field_name})
# Renumber the recency clusters by descending mean Recency, so the cluster
# with the smallest mean Recency ends up with the highest label.
tx_user = order_cluster(
    cluster_field_name='RecencyCluster',
    target_field_name='Recency',
    df=tx_user,
    ascending=False,
)
# Summary statistics of Recency within each cluster.
tx_user.groupby('RecencyCluster')['Recency'].describe()
# Frequency: number of tx_6m rows (with a non-null InvoiceDate) per customer.
tx_frequency = (
    tx_6m.groupby('CustomerID')['InvoiceDate']
    .count()
    .reset_index()
)
tx_frequency.columns = ['CustomerID', 'Frequency']

# Attach Frequency to the per-customer frame (default inner join on CustomerID).
tx_user = tx_user.merge(tx_frequency, on='CustomerID')
# Histogram of Frequency. Customers with Frequency >= 1000 are excluded from
# the plot only (presumably to keep the bulk of the distribution readable).
frequency_below_cap = tx_user.loc[tx_user['Frequency'] < 1000, 'Frequency']
plot_data = [go.Histogram(x=frequency_below_cap)]
plot_layout = go.Layout(title='Frequency')
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Cluster customers into 4 groups using Frequency as the only feature.
# random_state pins the centroid initialisation so the cluster assignment (and
# every score derived from it) is identical on each run. n_init is spelled out
# because scikit-learn changed its default from 10 to 'auto' in version 1.4.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
frequency_values = tx_user[['Frequency']]  # double brackets: 2-D, one feature column
kmeans.fit(frequency_values)
tx_user['FrequencyCluster'] = kmeans.predict(frequency_values)

# Renumber the clusters by ascending mean Frequency, so a higher label means
# a higher mean Frequency, then show per-cluster summary statistics.
tx_user = order_cluster('FrequencyCluster', 'Frequency', tx_user, True)
tx_user.groupby('FrequencyCluster')['Frequency'].describe()
# Monetary value: revenue of each tx_6m row, then the total per customer.
# Note that this adds a Revenue column to tx_6m itself.
tx_6m['Revenue'] = tx_6m['UnitPrice'].mul(tx_6m['Quantity'])
tx_revenue = (
    tx_6m.groupby('CustomerID')['Revenue']
    .sum()
    .reset_index()
)

# Attach Revenue to the per-customer frame (default inner join on CustomerID).
tx_user = tx_user.merge(tx_revenue, on='CustomerID')
# Histogram of Revenue. Customers with Revenue >= 10000 are excluded from the
# plot only (presumably to keep the bulk of the distribution readable).
revenue_below_cap = tx_user.loc[tx_user['Revenue'] < 10000, 'Revenue']
plot_data = [go.Histogram(x=revenue_below_cap)]
plot_layout = go.Layout(title='Monetary Value')
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Cluster customers into 4 groups using Revenue as the only feature.
# random_state pins the centroid initialisation so the cluster assignment (and
# every score derived from it) is identical on each run. n_init is spelled out
# because scikit-learn changed its default from 10 to 'auto' in version 1.4.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
revenue_values = tx_user[['Revenue']]  # double brackets: 2-D, one feature column
kmeans.fit(revenue_values)
tx_user['RevenueCluster'] = kmeans.predict(revenue_values)

# Renumber the clusters by ascending mean Revenue, so a higher label means a
# higher mean Revenue, then show the per-cluster characteristics.
tx_user = order_cluster('RevenueCluster', 'Revenue', tx_user, True)
tx_user.groupby('RevenueCluster')['Revenue'].describe()
# Overall score: sum of the three ordered cluster labels.
tx_user['OverallScore'] = (
    tx_user['RecencyCluster']
    + tx_user['FrequencyCluster']
    + tx_user['RevenueCluster']
)

# Every customer starts as Low-Value. The thresholds are applied in increasing
# order and each one overwrites the previous label, so a score above 4 ends up
# High-Value and a score above 2 (but not above 4) ends up Mid-Value.
tx_user['Segment'] = 'Low-Value'
for score_threshold, segment_name in ((2, 'Mid-Value'), (4, 'High-Value')):
    tx_user.loc[tx_user['OverallScore'] > score_threshold, 'Segment'] = segment_name
# Scatter of Revenue (y) against Frequency (x), one trace per segment.
# Rows with Revenue >= 50000 or Frequency >= 2000 are left out of the plot.
tx_graph = tx_user.query("Revenue < 50000 and Frequency < 2000")

# (Segment value, legend name, marker size, marker colour, marker opacity)
segment_traces = [
    ('Low-Value', 'Low', 7, 'blue', 0.8),
    ('Mid-Value', 'Mid', 9, 'green', 0.5),
    ('High-Value', 'High', 11, 'red', 0.9),
]

plot_data = [
    go.Scatter(
        x=tx_graph.loc[tx_graph['Segment'] == segment, 'Frequency'],
        y=tx_graph.loc[tx_graph['Segment'] == segment, 'Revenue'],
        mode='markers',
        name=legend_name,
        marker=dict(
            size=size,
            line=dict(width=1),
            color=color,
            opacity=opacity,
        ),
    )
    for segment, legend_name, size, color, opacity in segment_traces
]

plot_layout = go.Layout(
    yaxis={'title': "Revenue"},
    xaxis={'title': "Frequency"},
    title='Segments',
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Scatter of Revenue (y) against Recency (x), one trace per segment.
# Rows with Revenue >= 50000 or Frequency >= 2000 are left out of the plot.
tx_graph = tx_user.query("Revenue < 50000 and Frequency < 2000")

# (Segment value, legend name, marker size, marker colour, marker opacity)
segment_traces = [
    ('Low-Value', 'Low', 7, 'blue', 0.8),
    ('Mid-Value', 'Mid', 9, 'green', 0.5),
    ('High-Value', 'High', 11, 'red', 0.9),
]

plot_data = [
    go.Scatter(
        x=tx_graph.loc[tx_graph['Segment'] == segment, 'Recency'],
        y=tx_graph.loc[tx_graph['Segment'] == segment, 'Revenue'],
        mode='markers',
        name=legend_name,
        marker=dict(
            size=size,
            line=dict(width=1),
            color=color,
            opacity=opacity,
        ),
    )
    for segment, legend_name, size, color, opacity in segment_traces
]

plot_layout = go.Layout(
    yaxis={'title': "Revenue"},
    xaxis={'title': "Recency"},
    title='Segments',
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Scatter of Frequency (y) against Recency (x), one trace per segment.
# Rows with Revenue >= 50000 or Frequency >= 2000 are left out of the plot.
tx_graph = tx_user.query("Revenue < 50000 and Frequency < 2000")

# (Segment value, legend name, marker size, marker colour, marker opacity)
segment_traces = [
    ('Low-Value', 'Low', 7, 'blue', 0.8),
    ('Mid-Value', 'Mid', 9, 'green', 0.5),
    ('High-Value', 'High', 11, 'red', 0.9),
]

plot_data = [
    go.Scatter(
        x=tx_graph.loc[tx_graph['Segment'] == segment, 'Recency'],
        y=tx_graph.loc[tx_graph['Segment'] == segment, 'Frequency'],
        mode='markers',
        name=legend_name,
        marker=dict(
            size=size,
            line=dict(width=1),
            color=color,
            opacity=opacity,
        ),
    )
    for segment, legend_name, size, color, opacity in segment_traces
]

plot_layout = go.Layout(
    yaxis={'title': "Frequency"},
    xaxis={'title': "Recency"},
    title='Segments',
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment