karamanbk · October 1, 2020 16:37 · CaptainNC · Oct 1, 2020
diff --git a/g3_3m_rfm.py b/g3_3m_rfm.py
 #import libraries
 from datetime import datetime, timedelta,date
 import pandas as pd
 %matplotlib inline
 from sklearn.metrics import classification_report,confusion_matrix
 import matplotlib.pyplot as plt
 import numpy as np
 import seaborn as sns
 from __future__ import division
 from sklearn.cluster import KMeans


 import plotly.plotly as py
 import plotly.offline as pyoff
 import plotly.graph_objs as go

 import xgboost as xgb
 from sklearn.model_selection import KFold, cross_val_score, train_test_split

 import xgboost as xgb

 #initiate plotly notebook mode
 pyoff.init_notebook_mode()

 #read data from csv and redo the data work we did before
 tx_data = pd.read_csv('data.csv')
 tx_data['InvoiceDate'] = pd.to_datetime(tx_data['InvoiceDate'])
 #keep only the rows whose Country is 'United Kingdom'
 tx_uk = tx_data.query("Country=='United Kingdom'").reset_index(drop=True)

 #create 3m and 6m dataframes
 #the window bounds are pd.Timestamp rather than datetime.date: recent pandas
 #raises "TypeError: Invalid comparison between dtype=datetime64[ns] and date"
 #when a datetime64 column is compared with a plain date object
 window_3m_start = pd.Timestamp(2011, 3, 1)
 window_6m_start = pd.Timestamp(2011, 6, 1)
 window_6m_end = pd.Timestamp(2011, 12, 1)

 #tx_3m: 2011-03-01 <= InvoiceDate < 2011-06-01
 tx_3m = tx_uk[(tx_uk.InvoiceDate < window_6m_start) & (tx_uk.InvoiceDate >= window_3m_start)].reset_index(drop=True)
 #tx_6m: 2011-06-01 <= InvoiceDate < 2011-12-01
 tx_6m = tx_uk[(tx_uk.InvoiceDate >= window_6m_start) & (tx_uk.InvoiceDate < window_6m_end)].reset_index(drop=True)

 #create tx_user for assigning clustering (one row per distinct CustomerID in tx_3m)
 tx_user = pd.DataFrame(tx_3m['CustomerID'].unique())
 tx_user.columns = ['CustomerID']

 #order cluster method
 def order_cluster(cluster_field_name, target_field_name, df, ascending):
     """Renumber cluster labels so they follow the order of the per-cluster mean.

     The mean of ``target_field_name`` is computed for every value of
     ``cluster_field_name``; the clusters are sorted by that mean and each one
     receives its position in the sorted order (0, 1, 2, ...) as its new label.

     Args:
         cluster_field_name: name of the column holding the current labels.
         target_field_name: name of the column whose mean ranks the clusters.
         df: input DataFrame; it is not modified.
         ascending: passed to ``sort_values``; True gives label 0 to the
             cluster with the smallest mean, False to the one with the largest.

     Returns:
         A new DataFrame in which ``cluster_field_name`` holds the renumbered
         labels. The column ends up last, and the rows come from a default
         (inner) ``pd.merge``, so the row index is renumbered.
     """
     # Helper column for the new labels. Starting from 'index' keeps the
     # historical name, but it must not clash with a column df already has,
     # otherwise the merge would suffix both and the rename below would miss.
     rank_col = 'index'
     while rank_col in df.columns:
         rank_col = '_' + rank_col

     cluster_means = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
     cluster_means = cluster_means.sort_values(by=target_field_name, ascending=ascending).reset_index(drop=True)
     # after reset_index(drop=True) the row position is the cluster's rank
     cluster_means[rank_col] = cluster_means.index

     relabeled = pd.merge(df, cluster_means[[cluster_field_name, rank_col]], on=cluster_field_name)
     relabeled = relabeled.drop([cluster_field_name], axis=1)
     return relabeled.rename(columns={rank_col: cluster_field_name})


 #helper shared by the three scores: fit a 4-cluster KMeans on one column,
 #store the raw labels in '<value_field>Cluster' on the frame, then let
 #order_cluster renumber them; returns the fitted model and the new frame
 def _add_ordered_cluster(frame, value_field, ascending):
     cluster_field = value_field + 'Cluster'
     model = KMeans(n_clusters=4)
     model.fit(frame[[value_field]])
     frame[cluster_field] = model.predict(frame[[value_field]])
     return model, order_cluster(cluster_field, value_field, frame, ascending)


 #calculate recency score
 #Recency = days between the latest purchase in tx_3m and the customer's own latest purchase
 tx_max_purchase = tx_3m.groupby('CustomerID')['InvoiceDate'].max().reset_index(name='MaxPurchaseDate')
 tx_max_purchase = tx_max_purchase.assign(
     Recency=lambda d: (d['MaxPurchaseDate'].max() - d['MaxPurchaseDate']).dt.days
 )
 tx_user = tx_user.merge(tx_max_purchase[['CustomerID','Recency']], on='CustomerID')

 #descending order: the cluster with the largest mean Recency gets label 0
 kmeans, tx_user = _add_ordered_cluster(tx_user, 'Recency', False)

 #calculate frequency score
 #Frequency = number of InvoiceDate entries per customer in tx_3m
 tx_frequency = tx_3m.groupby('CustomerID')['InvoiceDate'].count().reset_index(name='Frequency')
 tx_user = tx_user.merge(tx_frequency, on='CustomerID')

 #ascending order: the cluster with the smallest mean Frequency gets label 0
 kmeans, tx_user = _add_ordered_cluster(tx_user, 'Frequency', True)

 #calculate revenue score
 #Revenue = UnitPrice * Quantity per row, summed per customer
 tx_3m['Revenue'] = tx_3m['UnitPrice'].mul(tx_3m['Quantity'])
 tx_revenue = tx_3m.groupby('CustomerID')['Revenue'].sum().reset_index()
 tx_user = tx_user.merge(tx_revenue, on='CustomerID')

 #ascending order: the cluster with the smallest mean Revenue gets label 0
 kmeans, tx_user = _add_ordered_cluster(tx_user, 'Revenue', True)


 #overall scoring: sum of the three ordered cluster labels
 tx_user['OverallScore'] = tx_user['RecencyCluster'].add(tx_user['FrequencyCluster']).add(tx_user['RevenueCluster'])
 #segments: score <= 2 stays 'Low-Value', 3-4 becomes 'Mid-Value', above 4 'High-Value'
 tx_user['Segment'] = 'Low-Value'
 tx_user.loc[tx_user['OverallScore'].gt(2), 'Segment'] = 'Mid-Value'
 tx_user.loc[tx_user['OverallScore'].gt(4), 'Segment'] = 'High-Value'
	#import libraries
	from datetime import datetime, timedelta,date
	import pandas as pd
	%matplotlib inline
	from sklearn.metrics import classification_report,confusion_matrix
	import matplotlib.pyplot as plt
	import numpy as np
	import seaborn as sns
	from __future__ import division
	from sklearn.cluster import KMeans


	import plotly.plotly as py
	import plotly.offline as pyoff
	import plotly.graph_objs as go

	import xgboost as xgb
	from sklearn.model_selection import KFold, cross_val_score, train_test_split

	import xgboost as xgb

	#initiate plotly notebook mode
	pyoff.init_notebook_mode()

	#read data from csv and redo the data work we did before
	tx_data = pd.read_csv('data.csv')
	tx_data['InvoiceDate'] = pd.to_datetime(tx_data['InvoiceDate'])
	#keep only the rows whose Country is 'United Kingdom'
	tx_uk = tx_data.query("Country=='United Kingdom'").reset_index(drop=True)

	#create 3m and 6m dataframes
	#the window bounds are pd.Timestamp rather than datetime.date: recent pandas
	#raises "TypeError: Invalid comparison between dtype=datetime64[ns] and date"
	#when a datetime64 column is compared with a plain date object
	window_3m_start = pd.Timestamp(2011, 3, 1)
	window_6m_start = pd.Timestamp(2011, 6, 1)
	window_6m_end = pd.Timestamp(2011, 12, 1)

	#tx_3m: 2011-03-01 <= InvoiceDate < 2011-06-01
	tx_3m = tx_uk[(tx_uk.InvoiceDate < window_6m_start) & (tx_uk.InvoiceDate >= window_3m_start)].reset_index(drop=True)
	#tx_6m: 2011-06-01 <= InvoiceDate < 2011-12-01
	tx_6m = tx_uk[(tx_uk.InvoiceDate >= window_6m_start) & (tx_uk.InvoiceDate < window_6m_end)].reset_index(drop=True)

	#create tx_user for assigning clustering (one row per distinct CustomerID in tx_3m)
	tx_user = pd.DataFrame(tx_3m['CustomerID'].unique())
	tx_user.columns = ['CustomerID']

	#order cluster method
	def order_cluster(cluster_field_name, target_field_name, df, ascending):
		"""Renumber cluster labels so they follow the order of the per-cluster mean.

		The mean of ``target_field_name`` is computed for every value of
		``cluster_field_name``; the clusters are sorted by that mean and each one
		receives its position in the sorted order (0, 1, 2, ...) as its new label.

		Args:
			cluster_field_name: name of the column holding the current labels.
			target_field_name: name of the column whose mean ranks the clusters.
			df: input DataFrame; it is not modified.
			ascending: passed to ``sort_values``; True gives label 0 to the
				cluster with the smallest mean, False to the one with the largest.

		Returns:
			A new DataFrame in which ``cluster_field_name`` holds the renumbered
			labels. The column ends up last, and the rows come from a default
			(inner) ``pd.merge``, so the row index is renumbered.
		"""
		# Helper column for the new labels. Starting from 'index' keeps the
		# historical name, but it must not clash with a column df already has,
		# otherwise the merge would suffix both and the rename below would miss.
		rank_col = 'index'
		while rank_col in df.columns:
			rank_col = '_' + rank_col

		cluster_means = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
		cluster_means = cluster_means.sort_values(by=target_field_name, ascending=ascending).reset_index(drop=True)
		# after reset_index(drop=True) the row position is the cluster's rank
		cluster_means[rank_col] = cluster_means.index

		relabeled = pd.merge(df, cluster_means[[cluster_field_name, rank_col]], on=cluster_field_name)
		relabeled = relabeled.drop([cluster_field_name], axis=1)
		return relabeled.rename(columns={rank_col: cluster_field_name})


	#helper shared by the three scores: fit a 4-cluster KMeans on one column,
	#store the raw labels in '<value_field>Cluster' on the frame, then let
	#order_cluster renumber them; returns the fitted model and the new frame
	def _add_ordered_cluster(frame, value_field, ascending):
		cluster_field = value_field + 'Cluster'
		model = KMeans(n_clusters=4)
		model.fit(frame[[value_field]])
		frame[cluster_field] = model.predict(frame[[value_field]])
		return model, order_cluster(cluster_field, value_field, frame, ascending)


	#calculate recency score
	#Recency = days between the latest purchase in tx_3m and the customer's own latest purchase
	tx_max_purchase = tx_3m.groupby('CustomerID')['InvoiceDate'].max().reset_index(name='MaxPurchaseDate')
	tx_max_purchase = tx_max_purchase.assign(
		Recency=lambda d: (d['MaxPurchaseDate'].max() - d['MaxPurchaseDate']).dt.days
	)
	tx_user = tx_user.merge(tx_max_purchase[['CustomerID','Recency']], on='CustomerID')

	#descending order: the cluster with the largest mean Recency gets label 0
	kmeans, tx_user = _add_ordered_cluster(tx_user, 'Recency', False)

	#calculate frequency score
	#Frequency = number of InvoiceDate entries per customer in tx_3m
	tx_frequency = tx_3m.groupby('CustomerID')['InvoiceDate'].count().reset_index(name='Frequency')
	tx_user = tx_user.merge(tx_frequency, on='CustomerID')

	#ascending order: the cluster with the smallest mean Frequency gets label 0
	kmeans, tx_user = _add_ordered_cluster(tx_user, 'Frequency', True)

	#calculate revenue score
	#Revenue = UnitPrice * Quantity per row, summed per customer
	tx_3m['Revenue'] = tx_3m['UnitPrice'].mul(tx_3m['Quantity'])
	tx_revenue = tx_3m.groupby('CustomerID')['Revenue'].sum().reset_index()
	tx_user = tx_user.merge(tx_revenue, on='CustomerID')

	#ascending order: the cluster with the smallest mean Revenue gets label 0
	kmeans, tx_user = _add_ordered_cluster(tx_user, 'Revenue', True)


	#overall scoring: sum of the three ordered cluster labels
	tx_user['OverallScore'] = tx_user['RecencyCluster'].add(tx_user['FrequencyCluster']).add(tx_user['RevenueCluster'])
	#segments: score <= 2 stays 'Low-Value', 3-4 becomes 'Mid-Value', above 4 'High-Value'
	tx_user['Segment'] = 'Low-Value'
	tx_user.loc[tx_user['OverallScore'].gt(2), 'Segment'] = 'Mid-Value'
	tx_user.loc[tx_user['OverallScore'].gt(4), 'Segment'] = 'High-Value'