nicolasesnis · April 6, 2020 18:01
diff --git a/full_script_sankey_medium.py b/full_script_sankey_medium.py
 import seaborn as sns
 import pandas as pd
 import plotly.graph_objects as go
 import chart_studio.plotly as py
 import plotly

 # Path to the raw data: https://gist.github.com/nicolasesnis/eb3b35545e97926ab53e0617c5e4b639

 # Load the raw export and keep the four columns this script works with:
 #   user_id      - identifier of the user
 #   time_install - when that user installed the app
 #   event_name   - name of an in-app event
 #   time_event   - when that event happened
 data = pd.read_csv('your/path/to/the/raw/data.csv')
 data = data[['user_id', 'time_install', 'event_name', 'time_event']]

 # Parse both timestamp columns into pandas datetimes.
 for time_column in ('time_event', 'time_install'):
     data[time_column] = pd.to_datetime(data[time_column])

 # Drop events stamped earlier than the install they belong to.
 data = data[data['time_event'] >= data['time_install']]

 # Represent installs as events of their own: one row per user, taken from the
 # user's earliest time_install and labelled 'install'. The extra event_type
 # column keeps installs distinguishable from in-app actions.
 installs = data[['user_id', 'time_install']]
 installs = installs.sort_values('time_install').drop_duplicates('user_id')
 installs['event_name'] = 'install'
 installs['event_type'] = 'install'
 installs = installs.rename(columns={'time_install': 'time_event'})

 # From here on the main frame carries only de-duplicated in-app events.
 data = data[['user_id', 'event_name', 'time_event']].drop_duplicates()
 data['event_type'] = 'in_app_action'

 # Stack events and installs, with the install columns put in the same order.
 data = pd.concat([data, installs[data.columns]])

 # Order rows per user with the install first (event_type sorted descending
 # puts 'install' ahead of 'in_app_action'), then chronologically.
 data = data.sort_values(
     by=['user_id', 'event_type', 'time_event'],
     ascending=[True, False, True],
 )

 # Group per user for the ranking step that follows.
 grouped = data.groupby('user_id')

 # c) Define a ranking function based on time_event, using the method = 'first' param to ensure no events have the same rank


 def rank(x):
     """Rank the rows of ``x`` by their ``time_event`` value, smallest first.

     Ties are broken by row order (``method='first'``), so no two rows share
     a rank. The result is an integer Series indexed like ``x``.
     """
     event_times = x['time_event']
     ordinal = event_times.rank(method='first')
     return ordinal.astype(int)


 # d) Rank every user's events and store the result as "rank_event".
 # reset_index drops the outermost index level of the apply result so that
 # only the row labels remain.
 per_user_rank = grouped.apply(rank)
 data["rank_event"] = per_user_rank.reset_index(level=0, drop=True)

 # Next, attach to each row the name of the event that follows it.

 # a) Group by user_id again so the groups include the new column
 grouped = data.groupby(by='user_id')

 # b) The shift function allows to access the next row's data. Here, we'll want the event name


 def get_next_event(x):
     """Return, for each row of ``x``, the ``event_name`` of the row below it.

     The last row has no successor and receives a missing value.
     """
     event_names = x['event_name']
     return event_names.shift(periods=-1)


 # c) Store the per-user "next event" lookup in a new "next_event" column
 per_user_next_event = grouped.apply(get_next_event)
 data["next_event"] = per_user_next_event.reset_index(level=0, drop=True)

 # The delay between an event and the one after it is computed the same way:

 # a) Group by user_id again
 grouped = data.groupby(by='user_id')

 # b) We make use one more time of the shift function:


 def get_time_diff(x):
     """Return, for each row of ``x``, the gap between it and the next row.

     The gap is the next row's ``time_event`` minus this row's ``time_event``;
     the last row has no successor and receives a missing value.
     """
     current_times = x['time_event']
     following_times = current_times.shift(-1)
     return following_times - current_times


 # c) Store each event's delay until the same user's next event as "time_to_next"
 per_user_delay = grouped.apply(get_time_diff)
 data["time_to_next"] = per_user_delay.reset_index(level=0, drop=True)

 # Only the start of each journey is plotted: keep rows whose rank_event is below 10.
 data = data[data['rank_event'] < 10]

 # Sanity check: the events found at rank 1 should all be installs.
 # (The value is only displayed in an interactive session; a script discards it.)
 data.loc[data['rank_event'] == 1, 'event_name'].unique()

 # Working on the nodes_dict

 # Every distinct event name; an event's position in this list selects its color.
 all_events = list(data.event_name.unique())

 # Base colors for the plot, written as hex strings and converted to
 # (red, green, blue) tuples of 0-255 integers.
 palette = ['50BE97', 'E4655C', 'FCC865',
            'BFD6DE', '3E5066', '353A3E', 'E6E6E6']
 palette = [
     tuple(int(hex_color[start:start + 2], 16) for start in (0, 2, 4))
     for hex_color in palette
 ]

 # If there are more events than base colors, top the palette up from Seaborn's
 # "deep" palette so that every event gets a color.
 complementary_palette = sns.color_palette(
     "deep", len(all_events) - len(palette))
 # Seaborn yields channels as floats between 0 and 1, whereas the base colors
 # above are 0-255 integers that later end up in "rgb(r, g, b)" strings.
 # Rescale so both kinds of color share the same 0-255 scale.
 palette.extend(
     tuple(int(round(channel * 255)) for channel in color)
     for color in complementary_palette
 )

 # output['nodes_dict'] maps each rank to the nodes drawn for that rank:
 #   'sources'       - event names seen at this rank
 #   'color'         - palette entry of each of those events
 #   'sources_index' - node index assigned to each of those events
 output = {'nodes_dict': {}}

 i = 0  # first node index not handed out yet
 for rank_event in data['rank_event'].unique():
     # Events that occur at this step of the journey
     events_here = list(
         data.loc[data['rank_event'] == rank_event, 'event_name'].unique())

     output['nodes_dict'][rank_event] = {
         'sources': events_here,
         # An event's color is the palette entry at its position in all_events
         'color': [palette[all_events.index(name)] for name in events_here],
         'sources_index': list(range(i, i + len(events_here))),
     }
     # Move past the indices just used so later ranks get fresh ones
     i += len(events_here)

 # Working on the links_dict

 output['links_dict'] = {}

 # One group per (user_id, rank_event) pair
 grouped = data.groupby(['user_id', 'rank_event'])

 # Define a function to read the souces, targets, values and time from event to next_event:


 def update_source_target(user):
     """Record one event-to-next-event transition in ``output['links_dict']``.

     Args:
         user: one group of a frame grouped by ``['user_id', 'rank_event']``.
             ``user.name[1]`` is read as the rank, and the first row's
             ``event_name``, ``next_event`` and ``time_to_next`` are used.

     Side effects:
         Updates the module-level
         ``output['links_dict'][source_index][target_index]``, where
         ``'unique_users'`` counts the transitions seen and
         ``'avg_time_to_next'`` holds the running SUM of their durations.

     A transition is skipped when the following rank has no entry in
     ``output['nodes_dict']`` or when an event name is not among that rank's
     nodes (for example a missing ``next_event``).
     """
     rank_event = user.name[1]
     nodes = output['nodes_dict']
     try:
         source_nodes = nodes[rank_event]
         target_nodes = nodes[rank_event + 1]
         source_index = source_nodes['sources_index'][
             source_nodes['sources'].index(user['event_name'].values[0])]
         target_index = target_nodes['sources_index'][
             target_nodes['sources'].index(user['next_event'].values[0])]
     except (KeyError, ValueError):
         # KeyError: the rank is not in nodes_dict.
         # ValueError: the event name is not one of that rank's nodes.
         # Only these expected cases are skipped; other errors now propagate
         # instead of being silently swallowed.
         return

     elapsed = user['time_to_next'].values[0]
     links_from_source = output['links_dict'].setdefault(source_index, {})
     link = links_from_source.get(target_index)
     if link is None:
         links_from_source[target_index] = {
             'unique_users': 1,
             'avg_time_to_next': elapsed,
         }
     else:
         link['unique_users'] += 1
         link['avg_time_to_next'] += elapsed


 # Walk every (user_id, rank_event) group to fill output['links_dict']
 grouped.apply(update_source_target)

 # Flatten links_dict into the parallel lists that the Sankey trace expects
 sources = []
 targets = []
 values = []
 time_to_next = []
 for source_key, links_from_source in output['links_dict'].items():
     for target_key, link in links_from_source.items():
         sources.append(source_key)
         targets.append(target_key)
         values.append(link['unique_users'])
         # 'avg_time_to_next' holds a sum; dividing by the count gives the mean
         mean_delay = pd.to_timedelta(
             link['avg_time_to_next'] / link['unique_users'])
         # Keep only the text before the first '.', dropping fractional seconds
         time_to_next.append(str(mean_delay).split('.')[0])

 # Node labels and colors, concatenated in nodes_dict order
 labels = []
 colors = []
 for node in output['nodes_dict'].values():
     labels.extend(node['sources'])
     # Each color tuple is rendered as an "rgb(...)" string
     colors.extend("rgb" + str(color) + "" for color in node['color'])

 sankey = go.Sankey(
     node=dict(
         thickness=10,  # default is 20
         line=dict(color="black", width=0.5),
         label=labels,
         color=colors,
     ),
     link=dict(
         source=sources,
         target=targets,
         value=values,
         label=time_to_next,
         hovertemplate=(
             '%{value} unique users went from %{source.label} to %{target.label}.<br />'
             + '<br />It took them %{label} in average.<extra></extra>'
         ),
     ),
 )
 fig = go.Figure(data=[sankey])

 fig.update_layout(
     autosize=True,
     title_text="Medium app",
     font=dict(size=15),
     plot_bgcolor='white',
 )

 # Either upload the figure through chart_studio or display it locally
 publish_to_web = True
 if not publish_to_web:
     fig.show(renderer='chrome')
 else:
     py.iplot(fig, filename='user_journey')
	import seaborn as sns
	import pandas as pd
	import plotly.graph_objects as go
	import chart_studio.plotly as py
	import plotly

	# Path to the raw data: https://gist.github.com/nicolasesnis/eb3b35545e97926ab53e0617c5e4b639

	# Load the raw export and keep the four columns this script works with:
	#   user_id      - identifier of the user
	#   time_install - when that user installed the app
	#   event_name   - name of an in-app event
	#   time_event   - when that event happened
	data = pd.read_csv('your/path/to/the/raw/data.csv')
	data = data[['user_id', 'time_install', 'event_name', 'time_event']]

	# Parse both timestamp columns into pandas datetimes.
	for time_column in ('time_event', 'time_install'):
	    data[time_column] = pd.to_datetime(data[time_column])

	# Drop events stamped earlier than the install they belong to.
	data = data[data['time_event'] >= data['time_install']]

	# Represent installs as events of their own: one row per user, taken from the
	# user's earliest time_install and labelled 'install'. The extra event_type
	# column keeps installs distinguishable from in-app actions.
	installs = data[['user_id', 'time_install']]
	installs = installs.sort_values('time_install').drop_duplicates('user_id')
	installs['event_name'] = 'install'
	installs['event_type'] = 'install'
	installs = installs.rename(columns={'time_install': 'time_event'})

	# From here on the main frame carries only de-duplicated in-app events.
	data = data[['user_id', 'event_name', 'time_event']].drop_duplicates()
	data['event_type'] = 'in_app_action'

	# Stack events and installs, with the install columns put in the same order.
	data = pd.concat([data, installs[data.columns]])

	# Order rows per user with the install first (event_type sorted descending
	# puts 'install' ahead of 'in_app_action'), then chronologically.
	data = data.sort_values(
	    by=['user_id', 'event_type', 'time_event'],
	    ascending=[True, False, True],
	)

	# Group per user for the ranking step that follows.
	grouped = data.groupby('user_id')

	# c) Define a ranking function based on time_event, using the method = 'first' param to ensure no events have the same rank


	def rank(x):
	    """Rank the rows of ``x`` by their ``time_event`` value, smallest first.

	    Ties are broken by row order (``method='first'``), so no two rows share
	    a rank. The result is an integer Series indexed like ``x``.
	    """
	    event_times = x['time_event']
	    ordinal = event_times.rank(method='first')
	    return ordinal.astype(int)


	# d) Rank every user's events and store the result as "rank_event".
	# reset_index drops the outermost index level of the apply result so that
	# only the row labels remain.
	per_user_rank = grouped.apply(rank)
	data["rank_event"] = per_user_rank.reset_index(level=0, drop=True)

	# Next, attach to each row the name of the event that follows it.

	# a) Group by user_id again so the groups include the new column
	grouped = data.groupby(by='user_id')

	# b) The shift function allows to access the next row's data. Here, we'll want the event name


	def get_next_event(x):
	    """Return, for each row of ``x``, the ``event_name`` of the row below it.

	    The last row has no successor and receives a missing value.
	    """
	    event_names = x['event_name']
	    return event_names.shift(periods=-1)


	# c) Store the per-user "next event" lookup in a new "next_event" column
	per_user_next_event = grouped.apply(get_next_event)
	data["next_event"] = per_user_next_event.reset_index(level=0, drop=True)

	# The delay between an event and the one after it is computed the same way:

	# a) Group by user_id again
	grouped = data.groupby(by='user_id')

	# b) We make use one more time of the shift function:


	def get_time_diff(x):
	    """Return, for each row of ``x``, the gap between it and the next row.

	    The gap is the next row's ``time_event`` minus this row's ``time_event``;
	    the last row has no successor and receives a missing value.
	    """
	    current_times = x['time_event']
	    following_times = current_times.shift(-1)
	    return following_times - current_times


	# c) Store each event's delay until the same user's next event as "time_to_next"
	per_user_delay = grouped.apply(get_time_diff)
	data["time_to_next"] = per_user_delay.reset_index(level=0, drop=True)

	# Only the start of each journey is plotted: keep rows whose rank_event is below 10.
	data = data[data['rank_event'] < 10]

	# Sanity check: the events found at rank 1 should all be installs.
	# (The value is only displayed in an interactive session; a script discards it.)
	data.loc[data['rank_event'] == 1, 'event_name'].unique()

	# Working on the nodes_dict

	# Every distinct event name; an event's position in this list selects its color.
	all_events = list(data.event_name.unique())

	# Base colors for the plot, written as hex strings and converted to
	# (red, green, blue) tuples of 0-255 integers.
	palette = ['50BE97', 'E4655C', 'FCC865',
	           'BFD6DE', '3E5066', '353A3E', 'E6E6E6']
	palette = [
	    tuple(int(hex_color[start:start + 2], 16) for start in (0, 2, 4))
	    for hex_color in palette
	]

	# If there are more events than base colors, top the palette up from Seaborn's
	# "deep" palette so that every event gets a color.
	complementary_palette = sns.color_palette(
	    "deep", len(all_events) - len(palette))
	# Seaborn yields channels as floats between 0 and 1, whereas the base colors
	# above are 0-255 integers that later end up in "rgb(r, g, b)" strings.
	# Rescale so both kinds of color share the same 0-255 scale.
	palette.extend(
	    tuple(int(round(channel * 255)) for channel in color)
	    for color in complementary_palette
	)

	# output['nodes_dict'] maps each rank to the nodes drawn for that rank:
	#   'sources'       - event names seen at this rank
	#   'color'         - palette entry of each of those events
	#   'sources_index' - node index assigned to each of those events
	output = {'nodes_dict': {}}

	i = 0  # first node index not handed out yet
	for rank_event in data['rank_event'].unique():
	    # Events that occur at this step of the journey
	    events_here = list(
	        data.loc[data['rank_event'] == rank_event, 'event_name'].unique())

	    output['nodes_dict'][rank_event] = {
	        'sources': events_here,
	        # An event's color is the palette entry at its position in all_events
	        'color': [palette[all_events.index(name)] for name in events_here],
	        'sources_index': list(range(i, i + len(events_here))),
	    }
	    # Move past the indices just used so later ranks get fresh ones
	    i += len(events_here)

	# Working on the links_dict

	output['links_dict'] = {}

	# One group per (user_id, rank_event) pair
	grouped = data.groupby(['user_id', 'rank_event'])

	# Define a function to read the souces, targets, values and time from event to next_event:


	def update_source_target(user):
	    """Record one event-to-next-event transition in ``output['links_dict']``.

	    Args:
	        user: one group of a frame grouped by ``['user_id', 'rank_event']``.
	            ``user.name[1]`` is read as the rank, and the first row's
	            ``event_name``, ``next_event`` and ``time_to_next`` are used.

	    Side effects:
	        Updates the module-level
	        ``output['links_dict'][source_index][target_index]``, where
	        ``'unique_users'`` counts the transitions seen and
	        ``'avg_time_to_next'`` holds the running SUM of their durations.

	    A transition is skipped when the following rank has no entry in
	    ``output['nodes_dict']`` or when an event name is not among that rank's
	    nodes (for example a missing ``next_event``).
	    """
	    rank_event = user.name[1]
	    nodes = output['nodes_dict']
	    try:
	        source_nodes = nodes[rank_event]
	        target_nodes = nodes[rank_event + 1]
	        source_index = source_nodes['sources_index'][
	            source_nodes['sources'].index(user['event_name'].values[0])]
	        target_index = target_nodes['sources_index'][
	            target_nodes['sources'].index(user['next_event'].values[0])]
	    except (KeyError, ValueError):
	        # KeyError: the rank is not in nodes_dict.
	        # ValueError: the event name is not one of that rank's nodes.
	        # Only these expected cases are skipped; other errors now propagate
	        # instead of being silently swallowed.
	        return

	    elapsed = user['time_to_next'].values[0]
	    links_from_source = output['links_dict'].setdefault(source_index, {})
	    link = links_from_source.get(target_index)
	    if link is None:
	        links_from_source[target_index] = {
	            'unique_users': 1,
	            'avg_time_to_next': elapsed,
	        }
	    else:
	        link['unique_users'] += 1
	        link['avg_time_to_next'] += elapsed


	# Walk every (user_id, rank_event) group to fill output['links_dict']
	grouped.apply(update_source_target)

	# Flatten links_dict into the parallel lists that the Sankey trace expects
	sources = []
	targets = []
	values = []
	time_to_next = []
	for source_key, links_from_source in output['links_dict'].items():
	    for target_key, link in links_from_source.items():
	        sources.append(source_key)
	        targets.append(target_key)
	        values.append(link['unique_users'])
	        # 'avg_time_to_next' holds a sum; dividing by the count gives the mean
	        mean_delay = pd.to_timedelta(
	            link['avg_time_to_next'] / link['unique_users'])
	        # Keep only the text before the first '.', dropping fractional seconds
	        time_to_next.append(str(mean_delay).split('.')[0])

	# Node labels and colors, concatenated in nodes_dict order
	labels = []
	colors = []
	for node in output['nodes_dict'].values():
	    labels.extend(node['sources'])
	    # Each color tuple is rendered as an "rgb(...)" string
	    colors.extend("rgb" + str(color) + "" for color in node['color'])

	sankey = go.Sankey(
	    node=dict(
	        thickness=10,  # default is 20
	        line=dict(color="black", width=0.5),
	        label=labels,
	        color=colors,
	    ),
	    link=dict(
	        source=sources,
	        target=targets,
	        value=values,
	        label=time_to_next,
	        hovertemplate=(
	            '%{value} unique users went from %{source.label} to %{target.label}.<br />'
	            + '<br />It took them %{label} in average.<extra></extra>'
	        ),
	    ),
	)
	fig = go.Figure(data=[sankey])

	fig.update_layout(
	    autosize=True,
	    title_text="Medium app",
	    font=dict(size=15),
	    plot_bgcolor='white',
	)

	# Either upload the figure through chart_studio or display it locally
	publish_to_web = True
	if publish_to_web:
	    py.iplot(fig, filename='user_journey')
	else:
	    fig.show(renderer='chrome')