karamanbk’s gists

karamanbk / g1_cohort_retention.py

Last active June 3, 2020 05:35

	# Build the retention table: a crosstab of CustomerID against
	# InvoiceYearMonth, joined with each customer's first purchase year-month
	# (MinPurchaseYearMonth) from tx_min_purchase.
	purchase_counts = pd.crosstab(
	    tx_user_purchase['CustomerID'],
	    tx_user_purchase['InvoiceYearMonth'],
	)
	first_purchase = tx_min_purchase[['CustomerID', 'MinPurchaseYearMonth']]
	tx_retention = pd.merge(purchase_counts.reset_index(), first_purchase, on='CustomerID')

	# Every column except the last gets an 'm_' prefix (CustomerID included);
	# the trailing MinPurchaseYearMonth column keeps its name.
	new_column_names = (
	    ['m_' + str(column) for column in tx_retention.columns[:-1]]
	    + ['MinPurchaseYearMonth']
	)
	tx_retention.columns = new_column_names

	#create the array of Retained users for each cohort monthly
	retention_array = []
	for i in range(len(months)):

karamanbk / g1_jpn.ipynb

Created May 3, 2019 17:15

Sorry, something went wrong. Reload?

Sorry, we cannot display this file.

Sorry, this file is invalid so it cannot be displayed.

karamanbk / g1_jpn.ipynb

Last active September 10, 2024 02:58

Sorry, something went wrong. Reload?

Sorry, we cannot display this file.

Sorry, this file is invalid so it cannot be displayed.

karamanbk / g2_g1_recap.py

Created May 3, 2019 19:25

	# import libraries
	from datetime import datetime, timedelta
	import pandas as pd
	%matplotlib inline
	import matplotlib.pyplot as plt
	import numpy as np
	import seaborn as sns
	from __future__ import division

	import plotly.plotly as py

karamanbk / g2_calc_recency.py

Created May 3, 2019 19:27

	# Generic per-user dataframe: one row per distinct CustomerID, meant to
	# hold the segmentation scores computed for each customer.
	tx_user = pd.DataFrame(tx_data['CustomerID'].unique(), columns=['CustomerID'])

	# Latest InvoiceDate per customer.
	tx_max_purchase = (
	    tx_uk.groupby('CustomerID')['InvoiceDate']
	    .max()
	    .reset_index(name='MaxPurchaseDate')
	)

	# Recency = whole days between a customer's last purchase and the observation
	# point, which is the latest purchase date found in the dataset.
	last_purchase = tx_max_purchase['MaxPurchaseDate']
	tx_max_purchase['Recency'] = (last_purchase.max() - last_purchase).dt.days

karamanbk / g2_recency_elbow.py

Last active May 3, 2019 19:52

	from sklearn.cluster import KMeans

	# Elbow method: fit KMeans for k = 1..9 on Recency and record the inertia
	# (sum of squared distances to the closest centroid) for each k.
	sse = {}
	# Work on an explicit copy so that adding the label column below never
	# writes through to (or warns about) tx_user.
	tx_recency = tx_user[['Recency']].copy()
	for k in range(1, 10):
	    # Fit on the Recency column only. tx_recency gains a "clusters" column
	    # inside this loop; fitting on the whole frame would feed the previous
	    # iteration's labels back in as a second feature and distort the SSE.
	    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(tx_recency[['Recency']])
	    tx_recency["clusters"] = kmeans.labels_
	    sse[k] = kmeans.inertia_
	# Plot SSE against the number of clusters.
	plt.figure()
	plt.plot(list(sse.keys()), list(sse.values()))

karamanbk / g2_recency_cluster.py

Created May 3, 2019 19:48

	# Fit a 4-cluster k-means model on Recency and store each customer's
	# predicted cluster label on tx_user as RecencyCluster.
	recency_values = tx_user[['Recency']]
	kmeans = KMeans(n_clusters=4).fit(recency_values)
	tx_user['RecencyCluster'] = kmeans.predict(recency_values)

	#function for ordering cluster numbers
	def order_cluster(cluster_field_name, target_field_name,df,ascending):
	new_cluster_field_name = 'new_' + cluster_field_name
	df_new = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
	df_new = df_new.sort_values(by=target_field_name,ascending=ascending).reset_index(drop=True)

karamanbk / g2_frequency.py

Created May 3, 2019 20:04

	# Count InvoiceDate entries per customer and expose the count as Frequency.
	tx_frequency = (
	    tx_uk.groupby('CustomerID')['InvoiceDate']
	    .count()
	    .reset_index(name='Frequency')
	)

	# Attach Frequency to the main user dataframe, joining on CustomerID.
	tx_user = pd.merge(tx_user, tx_frequency, on='CustomerID')

	#plot the histogram
	plot_data = [
	go.Histogram(

karamanbk / g2_frequency_cluster.py

Created May 3, 2019 20:07

	# k-means with 4 clusters on Frequency; the predicted label of every
	# customer is stored as FrequencyCluster.
	frequency_values = tx_user[['Frequency']]
	kmeans = KMeans(n_clusters=4).fit(frequency_values)
	tx_user['FrequencyCluster'] = kmeans.predict(frequency_values)

	# Re-number the clusters via order_cluster with ascending=True.
	# NOTE(review): order_cluster is defined elsewhere and only partly visible;
	# presumably labels end up ordered by mean Frequency - confirm.
	tx_user = order_cluster('FrequencyCluster', 'Frequency', tx_user, True)

	# Summary statistics of Frequency within each cluster.
	tx_user.groupby('FrequencyCluster')['Frequency'].describe()

karamanbk / g2_revenue.py

Created May 3, 2019 20:18

	# Revenue of each row is UnitPrice times Quantity, stored on tx_uk.
	tx_uk['Revenue'] = tx_uk['UnitPrice'].mul(tx_uk['Quantity'])

	# Total revenue per customer.
	tx_revenue = tx_uk.groupby('CustomerID')['Revenue'].sum().reset_index()

	# Attach the revenue totals to the main user dataframe, joining on CustomerID.
	tx_user = pd.merge(tx_user, tx_revenue, on='CustomerID')

	#plot the histogram
	plot_data = [
	go.Histogram(