# *-----------------------------------------------------------------
# | PROGRAM NAME: dc customer segmentation
# | DATE: 5/17/19
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:/Users/amandabogard/Google Drive/Python Scripts
# *----------------------------------------------------------------
# | PURPOSE: code from Data Camp course
# *----------------------------------------------------------------
import pandas as pd
import datetime as dt
# ----------------------------------------------
# Cohort Analysis
# ---------------------------------------------
# read data
online = pd.read_csv('/Users/amandabogard/Google Drive/Python Scripts/DC Customer Segmentation/chapter_1/online.csv')
online.head()
online.columns
online.info()
# Convert df['date'] from string to datetime
# Prepare a format string: time_format
time_format = '%Y-%m-%d %H:%M'
# Convert date_list into a datetime object: my_datetimes
online['InvoiceDate'] = pd.to_datetime(online['InvoiceDate'], format=time_format)
online.info()
### Assign daily acquisition cohort
# Define a function that will parse the date
def get_day(x): return dt.datetime(x.year, x.month, x.day)
# Create InvoiceDay column
online['InvoiceDay'] = online['InvoiceDate'].apply(get_day)
# Group by CustomerID and select the InvoiceDay value
grouping = online.groupby('CustomerID')['InvoiceDay']
# Assign a minimum InvoiceDay value to the dataset
online['CohortDay'] = grouping.transform('min')
### get cohort month
def get_month(x): return dt.datetime(x.year, x.month, 1) # define function to get month
online['InvoiceMonth'] = online['InvoiceDate'].apply(get_month) # create InvoiceMonth
grouping = online.groupby('CustomerID')['InvoiceMonth'] # create grouping by month
online['CohortMonth'] = grouping.transform('min') # get first month as CohortMonth
# View the top 5 rows
print(online.head())
### let's look at the data for customer 12747
subset = (online['CustomerID'] == 12747) # create subset logic
tmp = online[subset] # subset data
tmp = tmp[['CustomerID','InvoiceDay','CohortDay','InvoiceMonth','CohortMonth']] # look only at date columns
tmp = tmp.sort_values(['InvoiceDay']) # sort by invoice date
print(tmp) # verify that each person's cohort day is based on their first invoice
### Calculate time offset in days - part 1
def get_date_int(df, column):
    year = df[column].dt.year
    month = df[column].dt.month
    day = df[column].dt.day
    return year, month, day
# Get the integers for date parts from the `InvoiceDay` column
invoice_year,invoice_month,invoice_day = get_date_int(online,'InvoiceDay')
# Get the integers for date parts from the `CohortDay` column
cohort_year, cohort_month, cohort_day = get_date_int(online, 'CohortDay')
### Calculate time offset in days - part 2
# Calculate difference in years
years_diff = invoice_year - cohort_year
# Calculate difference in months
months_diff = invoice_month - cohort_month
# Calculate difference in days
days_diff = invoice_day - cohort_day
# Combine the differences into an approximate day offset (365 days per year, 30 per month), starting at 1
online['CohortIndex'] = years_diff * 365 + months_diff * 30 + days_diff + 1
print(online.head())
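# Cross-check (my addition, not part of the course exercise): an exact day offset can be
# computed directly from the datetime columns and compared against the 365/30 approximation above.
online['CohortIndexExact'] = (online['InvoiceDay'] - online['CohortDay']).dt.days + 1
print(online[['CohortIndex', 'CohortIndexExact']].describe())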
### check our test case
subset = (online['CustomerID'] == 12747) # create subset logic
tmp = online[subset] # subset data
tmp = tmp[['CustomerID','InvoiceDay','CohortDay','CohortIndex']] # look only at date columns
tmp = tmp.sort_values(['InvoiceDay']) # sort by invoice date
print(tmp) # verify that each person's cohort index is based on their first invoice
### Calculate retention rate from scratch
# create grouping object
grouping = online.groupby(['CohortMonth', 'CohortIndex'])
grouping.first() #check
# Count the number of unique values per customer ID
cohort_data = grouping['CustomerID'].apply(pd.Series.nunique).reset_index()
# Create a pivot
cohort_counts = cohort_data.pivot(index='CohortMonth', columns='CohortIndex', values='CustomerID')
# Select the first column and store it to cohort_sizes
cohort_sizes = cohort_counts.iloc[:,0]
# Divide the cohort count by cohort sizes along the rows
retention = cohort_counts.divide(cohort_sizes, axis=0)
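# Optional visualization (my addition, mirroring the heatmap used for average price below):
# the retention matrix reads most easily as an annotated heatmap with percentage labels.
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(10, 8))
plt.title('Retention by Monthly Cohorts')
sns.heatmap(data=retention, annot=True, fmt='.0%', cmap='BuGn')
plt.show()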
### check
tmp = online[['CustomerID','CohortMonth','CohortIndex']] # copy & subset data
tmp_grp = tmp.groupby(['CohortMonth', 'CohortIndex']) # create grouping object
tmp_grp.first() # check
### Calculate average price
# Create a groupby object and pass the monthly cohort and cohort index as a list
grouping = online.groupby(['CohortMonth', 'CohortIndex'])
# Calculate the average of the unit price
cohort_data = grouping['UnitPrice'].mean()
# Reset the index of cohort_data
cohort_data = cohort_data.reset_index()
# Create a pivot
average_quantity = cohort_data.pivot(index='CohortMonth', columns='CohortIndex', values='UnitPrice')
print(average_quantity.round(1))
### Visualize the average price metric (stored as average_quantity in the course exercise)
import matplotlib.pyplot as plt
# Import seaborn package as sns
import seaborn as sns
# Initialize an 8 by 6 inches plot figure
plt.figure(figsize=(8,6))
# Add a title
plt.title('Average Unit Price by Monthly Cohorts')
# Create the heatmap
sns.heatmap(data=average_quantity, annot=True, cmap='Blues')
plt.show()
# ----------------------------------------------
# Recency, Frequency, Monetary Value analysis
# ---------------------------------------------
# example: the `data` DataFrame below is the course's exercise dataset
# Create spend quartiles (4 groups) with labels 1 to 4
spend_quartile = pd.qcut(data['Spend'], q=4, labels=range(1,5))
# Assign the quartile values to the Spend_Quartile column in data
data['Spend_Quartile'] = spend_quartile
# Print data with sorted Spend values
print(data.sort_values('Spend'))
### Calculate Recency quartiles
# Store labels from 4 to 1 in decreasing order (lower recency gets a higher label)
r_labels = list(range(4, 0, -1))
# Create recency quartiles (4 groups) and pass the previously created labels
recency_quartiles = pd.qcut(data['Recency_Days'], q=4, labels=r_labels)
# Assign the quartile values to the Recency_Quartile column in `data`
data['Recency_Quartile'] = recency_quartiles
# Print `data` with sorted Recency_Days values
print(data.sort_values('Recency_Days'))
### Calculate RFM values
# Calculate Recency, Frequency and Monetary values for the online dataset we have used
# before - it has been loaded for you with recent 12 months of data. There's a TotalSum
# column in the online dataset which has been calculated by multiplying Quantity and
# UnitPrice: online['Quantity'] * online['UnitPrice']
# Also, we have created a snapshot_date variable that you can use to calculate recency.
# Feel free to print the online dataset and the snapshot_date into the Console.
# The pandas library is loaded as pd, and datetime as dt
# read data
online12m = pd.read_csv('/Users/amandabogard/Google Drive/Python Scripts/DC Customer Segmentation/chapter_2/online12M.csv')
online12m.info() # check data formats
# Convert df['date'] from string to datetime
# Prepare a format string: time_format
time_format = '%Y-%m-%d %H:%M'
# Convert date_list into a datetime object
online12m['InvoiceDate'] = pd.to_datetime(online12m['InvoiceDate'], format=time_format)
online12m.info() # check
# create total sum metric
online12m['TotalSum'] = online12m['Quantity'] * online12m['UnitPrice']
# add snapshot date - most recent date in this historical file + 1
online12m['snapshot_date'] = max(online12m.InvoiceDate) + dt.timedelta(days=1)
# create snapshot date value
snapshot_date = max(online12m.InvoiceDate) + dt.timedelta(days=1)
# Calculate Recency, Frequency and Monetary value for each customer
datamart = online12m.groupby(['CustomerID']).agg({
    'InvoiceDate': lambda x: (snapshot_date - x.max()).days,
    'InvoiceNo': 'count',
    'TotalSum': 'sum'})
# Rename the columns
datamart.rename(columns={'InvoiceDate': 'Recency',
                         'InvoiceNo': 'Frequency',
                         'TotalSum': 'MonetaryValue'}, inplace=True)
# Print top 5 rows
print(datamart.head())
### Calculate 3 groups for Recency and Frequency
# Create labels for Recency and Frequency
r_labels = range(3,0,-1); f_labels = range(1,4)
# Assign these labels to three equal percentile groups
r_groups = pd.qcut(datamart['Recency'], q=3, labels=r_labels)
# Assign these labels to three equal percentile groups
f_groups = pd.qcut(datamart['Frequency'], q=3, labels=f_labels)
# Create new columns R and F
datamart = datamart.assign(R = r_groups.values, F = f_groups.values)
### Calculate RFM Score
# Create labels for MonetaryValue
m_labels = range(1,4)
# Assign these labels to three equal percentile groups
m_groups = pd.qcut(datamart['MonetaryValue'], q=3, labels=m_labels)
# Create new column M
datamart = datamart.assign(M = m_groups.values)
# Calculate RFM_Score
datamart['RFM_Score'] = datamart[['R','F','M']].sum(axis=1)
print(datamart['RFM_Score'].head())
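# Sanity check (my addition, not part of the course code): the tercile cut points chosen by
# pd.qcut can be inspected with retbins=True to see where each group boundary falls.
_, r_bins = pd.qcut(datamart['Recency'], q=3, retbins=True)
_, f_bins = pd.qcut(datamart['Frequency'], q=3, retbins=True)
_, m_bins = pd.qcut(datamart['MonetaryValue'], q=3, retbins=True)
print('Recency cut points:', r_bins.round(1))
print('Frequency cut points:', f_bins.round(1))
print('MonetaryValue cut points:', m_bins.round(1))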
### Concatenate RFM quartile values to RFM_Segment
# define function to concatenate RFM values
def join_rfm(x): return str(x['R']) + str(x['F']) + str(x['M'])
datamart['RFM_Segment'] = datamart.apply(join_rfm, axis=1)
### explore segments
# look at largest segments
datamart.groupby('RFM_Segment').size().sort_values(ascending=False)[:10]
# Select bottom RFM segment "111" and view top 5 rows
datamart[datamart['RFM_Segment']=='111'][:5]
# Summary metrics per RFM Score
datamart.groupby('RFM_Score').agg({
    'Recency': 'mean',
    'Frequency': 'mean',
    'MonetaryValue': ['mean', 'count']}).round(1)
### Creating custom segments
# Define rfm_level function
def rfm_level(df):
    if df['RFM_Score'] >= 10:
        return 'Top'
    elif ((df['RFM_Score'] >= 6) and (df['RFM_Score'] < 10)):
        return 'Middle'
    else:
        return 'Low'
# Create a new variable RFM_Level
datamart['RFM_Level'] = datamart.apply(rfm_level, axis=1)
# Print the header with top 5 rows to the console
print(datamart.head())
### Analyzing custom segments
# Calculate average values for each RFM_Level, and return a size of each segment
rfm_level_agg = datamart.groupby('RFM_Level').agg({
    'Recency': 'mean',
    'Frequency': 'mean',
    # Return the size of each segment
    'MonetaryValue': ['mean', 'count']
}).round(1)
# Print the aggregated dataset
print(rfm_level_agg)
#--------------------------------------------
# Data pre-processing for clustering
#--------------------------------------------
### Calculate statistics of variables
### examples
# Print the average values of the variables in the dataset
print(datamart.mean())
# Print the standard deviation of the variables in the dataset
print(datamart.std())
# Use `describe` function to get key statistics of the dataset
print(datamart.describe())
### Detect skewed variables
# import seaborn as sns
# from matplotlib import pyplot as plt
sns.distplot(datamart['Recency'])
plt.show()
### examples
# Plot distribution of var1
plt.subplot(3, 1, 1); sns.distplot(data['var1'])
# Plot distribution of var2
plt.subplot(3, 1, 2); sns.distplot(data['var2'])
# Plot distribution of var3
plt.subplot(3, 1, 3); sns.distplot(data['var3'])
# Show the plot
plt.show()
### manage skewness
# example:
import numpy as np
# Apply log transformation to var2
data['var2_log'] = np.log(data['var2'])
# Apply log transformation to var3
data['var3_log'] = np.log(data['var3'])
# Create a subplot of the distribution of var2_log
plt.subplot(2, 1, 1); sns.distplot(data['var2_log'])
# Create a subplot of the distribution of var3_log
plt.subplot(2, 1, 2); sns.distplot(data['var3_log'])
# Show the plot
plt.show()
### Center and scale manually
# example:
# Center the data by subtracting average values from each entry
data_centered = data - data.mean()
# Scale the data by dividing each entry by standard deviation
data_scaled = data / data.std()
# Normalize the data by applying both centering and scaling
data_normalized = (data - data.mean()) / data.std()
# Print summary statistics to make sure average is zero and standard deviation is one
print(data_normalized.describe().round(2))
### Center and scale with StandardScaler()
# example:
from sklearn.preprocessing import StandardScaler
# Initialize a scaler
scaler = StandardScaler()
# Fit the scaler
scaler.fit(data)
# Scale and center the data
data_normalized = scaler.transform(data)
# Create a pandas DataFrame
data_normalized = pd.DataFrame(data_normalized, index=data.index, columns=data.columns)
# Print summary statistics
print(data_normalized.describe().round(2))
### Visualize RFM distributions
# read data
datamart_rfm = pd.read_csv('/Users/amandabogard/Google Drive/Python Scripts/DC Customer Segmentation/chapter_3/rfm_datamart.csv')
datamart_rfm.info() # check data formats
datamart_rfm.describe()
# Plot recency distribution
plt.subplot(3, 1, 1); sns.distplot(datamart_rfm['Recency'])
# Plot frequency distribution
plt.subplot(3, 1, 2); sns.distplot(datamart_rfm['Frequency'])
# Plot monetary value distribution
plt.subplot(3, 1, 3); sns.distplot(datamart_rfm['MonetaryValue'])
# Show the plot
plt.show()
### Pre-process RFM data
import numpy as np
from sklearn.preprocessing import StandardScaler
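# Sanity check (my addition, not part of the course code): np.log below requires strictly
# positive values, so confirm none of the RFM columns contain zeros or negatives first.
print((datamart_rfm <= 0).sum())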
# Unskew the data
datamart_log = np.log(datamart_rfm)
# Initialize a standard scaler and fit it
scaler = StandardScaler()
scaler.fit(datamart_log)
# Scale and center the data
datamart_normalized = scaler.transform(datamart_log)
# Create a pandas DataFrame
datamart_normalized = pd.DataFrame(data=datamart_normalized, index=datamart_rfm.index, columns=datamart_rfm.columns)
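# Quick check (my addition): after log-transforming and scaling, each column should have
# mean ~0 and standard deviation ~1.
print(datamart_normalized.describe().round(2))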
### Visualize the normalized variables
# Plot recency distribution
plt.subplot(3, 1, 1); sns.distplot(datamart_normalized['Recency'])
# Plot frequency distribution
plt.subplot(3, 1, 2); sns.distplot(datamart_normalized['Frequency'])
# Plot monetary value distribution
plt.subplot(3, 1, 3); sns.distplot(datamart_normalized['MonetaryValue'])
# Show the plot
plt.show()
#-----------------------------------------
# Customer Segmentation with K-means
#-----------------------------------------
# Import KMeans
from sklearn.cluster import KMeans
# Initialize KMeans
kmeans = KMeans(n_clusters=3, random_state=1)
# Fit k-means clustering on the normalized data set
kmeans.fit(datamart_normalized)
# Extract cluster labels
cluster_labels = kmeans.labels_
### Assign labels to raw data
# Create a DataFrame by adding a new cluster label column
datamart_rfm_k3 = datamart_rfm.assign(Cluster=cluster_labels)
# Group the data by cluster
grouped = datamart_rfm_k3.groupby(['Cluster'])
# Calculate average RFM values and segment sizes per cluster value
grouped.agg({
    'Recency': 'mean',
    'Frequency': 'mean',
    'MonetaryValue': ['mean', 'count']
}).round(1)
### Calculate sum of squared errors
# example:
# Fit KMeans and calculate SSE for each k
sse = {}
for k in range(1, 21):
    # Initialize KMeans with k clusters
    kmeans = KMeans(n_clusters=k, random_state=1)
    # Fit KMeans on the normalized dataset
    kmeans.fit(data_normalized)
    # Assign sum of squared distances to k element of dictionary
    sse[k] = kmeans.inertia_
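# Concrete run over the RFM data prepared above (my addition; mirrors the course example
# but uses datamart_normalized so the elbow plot below is drawn from real values).
from sklearn.cluster import KMeans
sse = {}
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=1)
    kmeans.fit(datamart_normalized)
    sse[k] = kmeans.inertia_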
### Plot sum of squared errors
# example:
# Add the plot title "The Elbow Method"
plt.title('The Elbow Method')
# Add X-axis label "k"
plt.xlabel('k')
# Add Y-axis label "SSE"
plt.ylabel('SSE')
# Plot SSE values for each key in the dictionary
sns.pointplot(x=list(sse.keys()), y=list(sse.values()))
plt.show()
### Prepare data for the snake plot
# example:
# Melt the normalized dataset and reset the index
datamart_melt = pd.melt(datamart_normalized.reset_index(),
                        # Assign CustomerID and Cluster as ID variables
                        id_vars=['CustomerID', 'Cluster'],
                        # Assign RFM values as value variables
                        value_vars=['Recency', 'Frequency', 'MonetaryValue'],
                        # Name the variable and value
                        var_name='Metric', value_name='Value'
                        )
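# Working version (my addition): the example melt above assumes datamart_normalized already
# carries a 'Cluster' column and is indexed by CustomerID (as in the course data; read the
# CSV with index_col='CustomerID' if needed). Assuming the frame built earlier holds only the
# RFM columns, attach the 3-cluster labels first and then melt.
datamart_normalized = datamart_normalized.assign(Cluster=cluster_labels)
datamart_melt = pd.melt(datamart_normalized.reset_index(),
                        id_vars=['CustomerID', 'Cluster'],
                        value_vars=['Recency', 'Frequency', 'MonetaryValue'],
                        var_name='Metric', value_name='Value')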
### Visualize snake plot
# Add the plot title
plt.title('Snake plot of normalized variables')
# Add the x axis label
plt.xlabel('Metric')
# Add the y axis label
plt.ylabel('Value')
# Plot a line for each value of the cluster variable
sns.lineplot(data=datamart_melt, x='Metric', y='Value', hue='Cluster')
plt.show()
### Calculate relative importance
# example:
# Calculate average RFM values for each cluster
cluster_avg = datamart_rfm_k3.groupby(['Cluster']).mean()
# Calculate average RFM values for the total customer population
population_avg = datamart_rfm.mean()
# Calculate relative importance of cluster's attribute value compared to population
relative_imp = cluster_avg / population_avg - 1
# Print relative importance scores rounded to 2 decimals
print(relative_imp.round(2))
### Relative importance heatmap
# Initialize a plot with a figure size of 8 by 2 inches
plt.figure(figsize=(8,2))
# Add the plot title
plt.title('Relative importance of attributes')
# Plot the heatmap
sns.heatmap(data=relative_imp, annot=True, fmt='.2f', cmap='RdYlGn')
plt.show()
# ---- End to end customer segmentation solution ----- #
datamart_rfmt = pd.read_csv('/Users/amandabogard/Google Drive/Python Scripts/DC Customer Segmentation/chapter_4/datamart_rfmt.csv')
### Pre-process data
# Import StandardScaler
from sklearn.preprocessing import StandardScaler
# Apply log transformation
datamart_rfmt_log = np.log(datamart_rfmt)
# Initialize StandardScaler and fit it
scaler = StandardScaler(); scaler.fit(datamart_rfmt_log)
# Transform and store the scaled data as datamart_rfmt_normalized
datamart_rfmt_normalized = scaler.transform(datamart_rfmt_log)
### Calculate and plot sum of squared errors
# Fit KMeans and calculate SSE for each k between 1 and 10
sse = {}
for k in range(1, 11):
    # Initialize KMeans with k clusters and fit it
    kmeans = KMeans(n_clusters=k, random_state=1).fit(datamart_rfmt_normalized)
    # Assign sum of squared distances to k element of the sse dictionary
    sse[k] = kmeans.inertia_
# Add the plot title, x and y axis labels
plt.title('The Elbow Method'); plt.xlabel('k'); plt.ylabel('SSE')
# Plot SSE values for each k stored as keys in the dictionary
sns.pointplot(x=list(sse.keys()), y=list(sse.values()))
plt.show()
### Build 4-cluster solution
# Import KMeans
from sklearn.cluster import KMeans
# Initialize KMeans
kmeans = KMeans(n_clusters=4,random_state=1)
# Fit k-means clustering on the normalized data set
kmeans.fit(datamart_rfmt_normalized)
# Extract cluster labels
cluster_labels = kmeans.labels_
### Analyze the segments
# Create a new DataFrame by adding a cluster label column to datamart_rfmt
datamart_rfmt_k4 = datamart_rfmt.assign(Cluster=cluster_labels)
# Group by cluster
grouped = datamart_rfmt_k4.groupby(['Cluster'])
# Calculate average RFMT values and segment sizes for each cluster
grouped.agg({
    'Recency': 'mean',
    'Frequency': 'mean',
    'MonetaryValue': 'mean',
    'Tenure': ['mean', 'count']
}).round(1)
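# Optional follow-up (my addition, mirroring the relative-importance analysis above):
# compare each RFMT cluster against the population average and plot it as a heatmap.
# Assumes datamart_rfmt holds only the RFMT columns (CustomerID as index, as in the course data).
import matplotlib.pyplot as plt
import seaborn as sns
cluster_avg_rfmt = datamart_rfmt_k4.groupby(['Cluster']).mean()
population_avg_rfmt = datamart_rfmt.mean()
relative_imp_rfmt = cluster_avg_rfmt / population_avg_rfmt - 1
plt.figure(figsize=(8, 2))
plt.title('Relative importance of attributes (RFMT clusters)')
sns.heatmap(data=relative_imp_rfmt, annot=True, fmt='.2f', cmap='RdYlGn')
plt.show()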