Last active
October 1, 2020 16:37
-
-
Save karamanbk/860c78615d615ae790cc8164f698953c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#import libraries | |
from datetime import datetime, timedelta,date | |
import pandas as pd | |
%matplotlib inline | |
from sklearn.metrics import classification_report,confusion_matrix | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import seaborn as sns | |
from __future__ import division | |
from sklearn.cluster import KMeans | |
import plotly.plotly as py | |
import plotly.offline as pyoff | |
import plotly.graph_objs as go | |
import xgboost as xgb | |
from sklearn.model_selection import KFold, cross_val_score, train_test_split | |
import xgboost as xgb | |
# Initialise plotly for offline rendering inside the notebook.
pyoff.init_notebook_mode()

# Read the transactions from CSV and redo the data preparation done before.
tx_data = pd.read_csv('data.csv')
tx_data['InvoiceDate'] = pd.to_datetime(tx_data['InvoiceDate'])
tx_uk = tx_data.query("Country=='United Kingdom'").reset_index(drop=True)

# Create the 3-month (2011-03-01 .. 2011-05-31) and 6-month
# (2011-06-01 .. 2011-11-30) dataframes.
# The bounds are pd.Timestamp values rather than datetime.date objects:
# InvoiceDate is a datetime column, and comparing it with datetime.date
# raises "TypeError: can't compare datetime.datetime to datetime.date".
tx_3m = tx_uk[
    (tx_uk.InvoiceDate < pd.Timestamp('2011-06-01'))
    & (tx_uk.InvoiceDate >= pd.Timestamp('2011-03-01'))
].reset_index(drop=True)
tx_6m = tx_uk[
    (tx_uk.InvoiceDate >= pd.Timestamp('2011-06-01'))
    & (tx_uk.InvoiceDate < pd.Timestamp('2011-12-01'))
].reset_index(drop=True)

# Create tx_user (one row per customer seen in the 3-month window); the
# cluster columns are attached to it below.
tx_user = pd.DataFrame(tx_3m['CustomerID'].unique())
tx_user.columns = ['CustomerID']
#order cluster method | |
def order_cluster(cluster_field_name, target_field_name, df, ascending):
    """Relabel clusters so that label order follows the cluster means.

    The mean of ``target_field_name`` is computed per cluster, the clusters
    are sorted by that mean, and each cluster is given its position in the
    sorted order (0, 1, 2, ...) as its new label.

    Args:
        cluster_field_name: Name of the column holding the current cluster
            labels. It is replaced by the re-ordered labels.
        target_field_name: Name of the numeric column whose per-cluster mean
            defines the ordering.
        df: DataFrame containing both columns.
        ascending: If True, the cluster with the smallest mean gets label 0;
            if False, the cluster with the largest mean gets label 0.

    Returns:
        A new DataFrame with the same rows as ``df`` (inner-joined on the
        cluster label) in which ``cluster_field_name`` holds the re-ordered
        labels. The cluster column ends up as the last column.
    """
    # Temporary column name for the new labels. A dedicated name is used
    # instead of 'index' so that an existing 'index' column in ``df`` does
    # not collide with it during the merge.
    rank_column = '__cluster_rank__'

    cluster_means = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
    cluster_means = cluster_means.sort_values(
        by=target_field_name, ascending=ascending
    ).reset_index(drop=True)
    # After the sort + reset, the positional index is the new cluster label.
    cluster_means[rank_column] = cluster_means.index

    relabelled = pd.merge(
        df, cluster_means[[cluster_field_name, rank_column]], on=cluster_field_name
    )
    relabelled = relabelled.drop([cluster_field_name], axis=1)
    relabelled = relabelled.rename(columns={rank_column: cluster_field_name})
    return relabelled
# Calculate recency score.
# Recency is the number of days between a customer's last purchase and the
# latest last-purchase date among all customers in the 3-month window.
tx_max_purchase = tx_3m.groupby('CustomerID').InvoiceDate.max().reset_index()
tx_max_purchase.columns = ['CustomerID', 'MaxPurchaseDate']
tx_max_purchase['Recency'] = (tx_max_purchase['MaxPurchaseDate'].max() - tx_max_purchase['MaxPurchaseDate']).dt.days
tx_user = pd.merge(tx_user, tx_max_purchase[['CustomerID', 'Recency']], on='CustomerID')

# Cluster customers on Recency into 4 groups.
# NOTE(review): no random_state is passed, so cluster boundaries may differ
# between runs.
kmeans = KMeans(n_clusters=4)
kmeans.fit(tx_user[['Recency']])
tx_user['RecencyCluster'] = kmeans.predict(tx_user[['Recency']])

# ascending=False: the cluster with the highest mean Recency gets label 0,
# so a higher label means a more recent customer.
tx_user = order_cluster('RecencyCluster', 'Recency', tx_user, False)
# Calculate frequency score.
# Frequency is the number of transaction rows per customer in the 3-month
# window (a count of InvoiceDate values).
tx_frequency = tx_3m.groupby('CustomerID').InvoiceDate.count().reset_index()
tx_frequency.columns = ['CustomerID', 'Frequency']
tx_user = pd.merge(tx_user, tx_frequency, on='CustomerID')

# Cluster customers on Frequency into 4 groups.
# NOTE(review): no random_state is passed, so cluster boundaries may differ
# between runs.
kmeans = KMeans(n_clusters=4)
kmeans.fit(tx_user[['Frequency']])
tx_user['FrequencyCluster'] = kmeans.predict(tx_user[['Frequency']])

# ascending=True: the cluster with the lowest mean Frequency gets label 0.
tx_user = order_cluster('FrequencyCluster', 'Frequency', tx_user, True)
# Calculate revenue score.
# Revenue per row is UnitPrice * Quantity; it is summed per customer.
tx_3m['Revenue'] = tx_3m['UnitPrice'] * tx_3m['Quantity']
tx_revenue = tx_3m.groupby('CustomerID').Revenue.sum().reset_index()
tx_user = pd.merge(tx_user, tx_revenue, on='CustomerID')

# Cluster customers on Revenue into 4 groups.
# NOTE(review): no random_state is passed, so cluster boundaries may differ
# between runs.
kmeans = KMeans(n_clusters=4)
kmeans.fit(tx_user[['Revenue']])
tx_user['RevenueCluster'] = kmeans.predict(tx_user[['Revenue']])

# ascending=True: the cluster with the lowest mean Revenue gets label 0.
tx_user = order_cluster('RevenueCluster', 'Revenue', tx_user, True)
# Overall scoring: the sum of the three ordered cluster labels.
tx_user['OverallScore'] = tx_user['RecencyCluster'] + tx_user['FrequencyCluster'] + tx_user['RevenueCluster']

# Map the score to a segment: scores up to 2 are 'Low-Value', 3-4 are
# 'Mid-Value', and above 4 are 'High-Value'. The assignments are applied in
# order, so the later (higher) threshold overrides the earlier one.
tx_user['Segment'] = 'Low-Value'
tx_user.loc[tx_user['OverallScore'] > 2, 'Segment'] = 'Mid-Value'
tx_user.loc[tx_user['OverallScore'] > 4, 'Segment'] = 'High-Value'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks, it is very useful. But when I run the code, an error pops up:
TypeError: can't compare datetime.datetime to datetime.date