Created
April 6, 2020 18:01
-
-
Save nicolasesnis/595d34c3c7dbca2b3419332304954433 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import seaborn as sns | |
import pandas as pd | |
import plotly.graph_objects as go | |
import chart_studio.plotly as py | |
import plotly | |
# Path to the raw data: https://gist.github.com/nicolasesnis/eb3b35545e97926ab53e0617c5e4b639 | |
data = pd.read_csv('your/path/to/the/raw/data.csv')[ | |
['user_id', 'time_install', 'event_name', 'time_event']] | |
# Start with making sure that time_event and time_insrall are Pandas Datetime types: | |
data['time_event'] = pd.to_datetime(data['time_event']) | |
data['time_install'] = pd.to_datetime(data['time_install']) | |
# Make sure that there's no event occurring before time_install | |
data = data[data.time_event >= data.time_install] | |
# The initial data Pandas DataFrame must have these 4 columns: | |
# user_id | time_install | event_name | time_event | |
# - user_id (string): the unique identifier of a user | |
# - time_install (Pandas datetime): the time when the user installed the app (there should be 1 time_install per user_id) | |
# - event_name (string): the name of a specific in-app event (there can be many event_name per user_id) | |
# - time_event (Pandas datetime): the time of each event (there should be 1 time_event per user_id) | |
# Edit this dataframe so that installs are passed as events | |
# Create a new DF from the data DF containing only install data | |
installs = data[['user_id', 'time_install']].sort_values( | |
'time_install').drop_duplicates('user_id') | |
# Create an install column containing dummy "install" events | |
installs['event_name'] = 'install' | |
# Create an event_type column to keep the information of install vs other events | |
installs['event_type'] = 'install' | |
# Rename time_install to time_event | |
installs.rename(columns={'time_install': 'time_event'}, inplace=True) | |
# In the data DF, keep only events data and create the event_type column | |
data = data[['user_id', 'event_name', | |
'time_event']].drop_duplicates() | |
data['event_type'] = 'in_app_action' | |
# Concatenate the two DataFrames | |
data = pd.concat([data, installs[data.columns]]) | |
# Based on the time of events, we can compute the rank of each action at the user_id level: | |
# a) Sort ascendingly per user_id and time_event | |
# sort by event_type to make sure installs come first | |
data.sort_values(['user_id', 'event_type', 'time_event'], | |
ascending=[True, False, True], inplace=True) | |
# b) Group by user_id | |
grouped = data.groupby('user_id') | |
# c) Define a ranking function based on time_event, using the method = 'first' param to ensure no events have the same rank | |
def rank(x): return x['time_event'].rank(method='first').astype(int) | |
# d) Apply the ranking function to the data DF into a new "rank_event" column | |
data["rank_event"] = grouped.apply(rank).reset_index(0, drop=True) | |
# Add, each row, the information about the next_event | |
# a) Regroup by user_id | |
grouped = data.groupby('user_id') | |
# b) The shift function allows to access the next row's data. Here, we'll want the event name | |
def get_next_event(x): return x['event_name'].shift(-1) | |
# c) Apply the function into a new "next_event" column | |
data["next_event"] = grouped.apply( | |
lambda x: get_next_event(x)).reset_index(0, drop=True) | |
# Likewise, we can compute time from each event to its next event: | |
# a) Regroup by user_id | |
grouped = data.groupby('user_id') | |
# b) We make use one more time of the shift function: | |
def get_time_diff( | |
x): return x['time_event'].shift(-1) - x['time_event'] | |
# c) Apply the function to the data DF into a new "time_to_next" column | |
data["time_to_next"] = grouped.apply( | |
lambda x: get_time_diff(x)).reset_index(0, drop=True) | |
# Here we'll plot the journey up to the 10th action. This can be achieved by filtering the dataframe based on the rank_event column that we computed: | |
data = data[data.rank_event < 10] | |
# Check that you have only installs at rank 1: | |
data[data['rank_event'] == 1].event_name.unique() | |
# Working on the nodes_dict | |
all_events = list(data.event_name.unique()) | |
# Create a set of colors that you'd like to use in your plot. | |
palette = ['50BE97', 'E4655C', 'FCC865', | |
'BFD6DE', '3E5066', '353A3E', 'E6E6E6'] | |
# Here, I passed the colors as hex, but we need to pass it as RGB. This loop will do: | |
for i, col in enumerate(palette): | |
palette[i] = tuple(int(col[i:i+2], 16) for i in (0, 2, 4)) | |
# Append a Seaborn complementary palette to your palette in case you did not provide enough colors to style every event | |
complementary_palette = sns.color_palette( | |
"deep", len(all_events) - len(palette)) | |
if len(complementary_palette) > 0: | |
palette.extend(complementary_palette) | |
output = dict() | |
output.update({'nodes_dict': dict()}) | |
i = 0 | |
for rank_event in data.rank_event.unique(): # For each rank of event... | |
# Create a new key equal to the rank... | |
output['nodes_dict'].update( | |
{rank_event: dict()} | |
) | |
# Look at all the events that were done at this step of the funnel... | |
all_events_at_this_rank = data[data.rank_event == | |
rank_event].event_name.unique() | |
# Read the colors for these events and store them in a list... | |
rank_palette = [] | |
for event in all_events_at_this_rank: | |
rank_palette.append(palette[list(all_events).index(event)]) | |
# Keep trace of the events' names, colors and indices. | |
output['nodes_dict'][rank_event].update( | |
{ | |
'sources': list(all_events_at_this_rank), | |
'color': rank_palette, | |
'sources_index': list(range(i, i+len(all_events_at_this_rank))) | |
} | |
) | |
# Finally, increment by the length of this rank's available events to make sure next indices will not be chosen from existing ones | |
i += len(output['nodes_dict'][rank_event]['sources_index']) | |
# Working on the links_dict | |
output.update({'links_dict': dict()}) | |
# Group the DataFrame by user_id and rank_event | |
grouped = data.groupby(['user_id', 'rank_event']) | |
# Define a function to read the souces, targets, values and time from event to next_event: | |
def update_source_target(user): | |
try: | |
source_index = output['nodes_dict'][user.name[1]]['sources_index'][output['nodes_dict'] | |
[user.name[1]]['sources'].index(user['event_name'].values[0])] | |
target_index = output['nodes_dict'][user.name[1] + 1]['sources_index'][output['nodes_dict'] | |
[user.name[1] + 1]['sources'].index(user['next_event'].values[0])] | |
if source_index in output['links_dict']: | |
if target_index in output['links_dict'][source_index]: | |
output['links_dict'][source_index][target_index]['unique_users'] += 1 | |
output['links_dict'][source_index][target_index]['avg_time_to_next'] += user['time_to_next'].values[0] | |
else: | |
output['links_dict'][source_index].update({target_index: | |
dict( | |
{'unique_users': 1, | |
'avg_time_to_next': user['time_to_next'].values[0]} | |
) | |
}) | |
else: | |
output['links_dict'].update({source_index: dict({target_index: dict( | |
{'unique_users': 1, 'avg_time_to_next': user['time_to_next'].values[0]})})}) | |
except Exception as e: | |
pass | |
# Apply the function to your grouped Pandas object: | |
grouped.apply(lambda user: update_source_target(user)) | |
targets = [] | |
sources = [] | |
values = [] | |
time_to_next = [] | |
for source_key, source_value in output['links_dict'].items(): | |
for target_key, target_value in output['links_dict'][source_key].items(): | |
sources.append(source_key) | |
targets.append(target_key) | |
values.append(target_value['unique_users']) | |
time_to_next.append(str(pd.to_timedelta( | |
target_value['avg_time_to_next'] / target_value['unique_users'])).split('.')[0]) # Split to remove the milliseconds information | |
labels = [] | |
colors = [] | |
for key, value in output['nodes_dict'].items(): | |
labels = labels + list(output['nodes_dict'][key]['sources']) | |
colors = colors + list(output['nodes_dict'][key]['color']) | |
for idx, color in enumerate(colors): | |
colors[idx] = "rgb" + str(color) + "" | |
fig = go.Figure(data=[go.Sankey( | |
node=dict( | |
thickness=10, # default is 20 | |
line=dict(color="black", width=0.5), | |
label=labels, | |
color=colors | |
), | |
link=dict( | |
source=sources, | |
target=targets, | |
value=values, | |
label=time_to_next, | |
hovertemplate='%{value} unique users went from %{source.label} to %{target.label}.<br />' + | |
'<br />It took them %{label} in average.<extra></extra>', | |
))]) | |
fig.update_layout(autosize=True, title_text="Medium app", | |
font=dict(size=15), plot_bgcolor='white') | |
publish_to_web = True | |
if publish_to_web: | |
py.iplot(fig, filename='user_journey') | |
else: | |
fig.show(renderer='chrome') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment