-
-
Save markrittman/b64c6dc04e39886c8b7870f357e7300f to your computer and use it in GitHub Desktop.
WITH | |
events AS ( | |
SELECT | |
TIMESTAMP_MICROS(event_timestamp) AS event_ts, | |
CONCAT(user_pseudo_id,'-',event_name,'-',CAST(event_timestamp AS STRING)) AS event_id, | |
user_pseudo_id AS user_pseudo_id, | |
user_id, | |
traffic_source.name AS utm_channel, | |
traffic_source.medium AS utm_medium, | |
traffic_source.source AS utm_source, | |
event_name AS event_type, | |
( | |
SELECT | |
value.int_value | |
FROM | |
UNNEST(event_params) | |
WHERE | |
KEY = 'ga_session_id') AS session_id, | |
( | |
SELECT | |
value.int_value | |
FROM | |
UNNEST(event_params) | |
WHERE | |
KEY = 'ga_session_number') AS session_number, | |
( | |
SELECT | |
value.string_value | |
FROM | |
UNNEST(event_params) | |
WHERE | |
KEY = 'page_referrer') AS referrer_host, | |
( | |
SELECT | |
value.string_value | |
FROM | |
UNNEST(event_params) | |
WHERE | |
KEY = 'page_location') AS page_path, | |
( | |
SELECT | |
value.string_value | |
FROM | |
UNNEST(event_params) | |
WHERE | |
event_name = 'page_view' | |
AND KEY = 'page_title') AS page_title, | |
ecommerce.purchase_revenue AS order_value, | |
ecommerce.transaction_id AS order_id, | |
platform AS channel, | |
device.category AS device_category, | |
device.operating_system, | |
device.language, | |
device.is_limited_ad_tracking, | |
NULL AS browser, | |
NULL AS hostname, | |
geo.continent, | |
geo.country, | |
geo.region, | |
geo.city | |
FROM | |
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*` -- modify to your project | |
), | |
id_stitching AS ( | |
SELECT | |
DISTINCT user_pseudo_id AS user_pseudo_id, | |
LAST_VALUE(user_id IGNORE NULLS) OVER (PARTITION BY user_pseudo_id ORDER BY event_ts ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS user_id, | |
MIN(event_ts) OVER (PARTITION BY user_pseudo_id ) AS first_seen_at, | |
MAX(event_ts) OVER (PARTITION BY user_pseudo_id ) AS last_seen_at | |
FROM | |
events), | |
sessions AS ( | |
SELECT | |
user_pseudo_id, | |
TIMESTAMP_MICROS(event_timestamp) AS session_start_ts, | |
CAST(LEAD(TIMESTAMP_MICROS(event_timestamp),1) OVER (PARTITION BY CONCAT(user_pseudo_id) | |
ORDER BY | |
event_timestamp) AS timestamp) AS session_end_ts, | |
( | |
SELECT | |
value.int_value | |
FROM | |
UNNEST(event_params) | |
WHERE | |
KEY = 'ga_session_id') AS session_id, | |
( | |
SELECT | |
value.int_value | |
FROM | |
UNNEST(event_params) | |
WHERE | |
KEY = 'ga_session_number') AS session_number, | |
( | |
SELECT | |
value.string_value | |
FROM | |
UNNEST(event_params) | |
WHERE | |
KEY = 'page_referrer') AS referrer_host, | |
( | |
SELECT | |
value.string_value | |
FROM | |
UNNEST(event_params) | |
WHERE | |
KEY = 'page_location') AS landing_page_path, | |
( | |
SELECT | |
value.string_value | |
FROM | |
UNNEST(event_params) | |
WHERE | |
event_name = 'page_view' | |
AND KEY = 'page_title') AS landing_page_title, | |
traffic_source.name AS utm_campaign, | |
traffic_source.medium AS utm_medium, | |
traffic_source.source AS utm_source, | |
platform AS channel, | |
CASE | |
WHEN device.category = "desktop" THEN "desktop" | |
WHEN device.category = "tablet" | |
AND app_info.id IS NULL THEN "tablet-web" | |
WHEN device.category = "mobile" AND app_info.id IS NULL THEN "mobile-web" | |
WHEN device.category = "tablet" | |
AND app_info.id IS NOT NULL THEN "tablet-app" | |
WHEN device.category = "mobile" AND app_info.id IS NOT NULL THEN "mobile-app" | |
END | |
AS device, | |
device.mobile_brand_name mobile_brand_name, | |
device.mobile_model_name mobile_model_name, | |
device.mobile_marketing_name mobile_marketing_name, | |
device.mobile_os_hardware_model mobile_os_hardware_model, | |
device.operating_system operating_system, | |
device.operating_system_version operating_system_version, | |
device.vendor_id vendor_id, | |
device.advertising_id advertising_id, | |
device.language LANGUAGE, | |
device.is_limited_ad_tracking is_limited_ad_tracking, | |
device.time_zone_offset_seconds, | |
NULL AS browser, | |
NULL AS browser_version, | |
NULL AS browser, | |
device.web_info.browser_version, | |
NULL AS hostname, | |
geo.continent continent, | |
geo.country country, | |
geo.region region, | |
geo.city city, | |
COUNT(DISTINCT CONCAT(user_pseudo_id,'-',event_name,'-',CAST(event_timestamp AS STRING))) OVER (PARTITION BY (SELECT value.int_value FROM UNNEST(event_params) | |
WHERE | |
KEY = 'ga_session_id')) AS events | |
FROM | |
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*` s -- modify to your project | |
WHERE | |
event_name = 'session_start' ), | |
user_stitched_sessions AS ( | |
SELECT | |
sessions.*, | |
COALESCE(id_stitching.user_id, sessions.user_pseudo_id) AS blended_user_id | |
FROM | |
sessions | |
LEFT JOIN | |
id_stitching | |
USING | |
(user_pseudo_id) ), | |
user_stitched_events AS ( | |
SELECT | |
events.*, | |
COALESCE(id_stitching.user_id, events.user_pseudo_id) AS blended_user_id | |
FROM | |
events | |
LEFT JOIN | |
id_stitching | |
USING | |
(user_pseudo_id) ), | |
events_filtered AS ( | |
SELECT | |
* | |
FROM ( | |
SELECT | |
*, | |
FIRST_VALUE(CASE | |
WHEN event_type = 'add_payment_info' THEN event_id | |
END | |
IGNORE NULLS) OVER (PARTITION BY blended_user_id ORDER BY event_ts ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS first_registration_event_id, | |
FIRST_VALUE(CASE | |
WHEN event_type='purchase' THEN event_id | |
END | |
IGNORE NULLS) OVER (PARTITION BY blended_user_id ORDER BY event_ts ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS first_order_event_id | |
FROM | |
user_stitched_events ) | |
WHERE | |
event_type = 'purchase' | |
OR (event_type='add_payment_info' | |
AND event_id = first_registration_event_id) ), | |
converting_events AS ( | |
SELECT | |
e.blended_user_id, | |
session_id, | |
event_type, | |
order_id AS order_id, | |
CASE | |
WHEN event_type='purchase' AND event_id = first_order_event_id THEN order_value | |
ELSE | |
0 | |
END | |
AS first_order_revenue, | |
CASE | |
WHEN event_type='purchase' AND event_id != first_order_event_id THEN order_value | |
ELSE | |
0 | |
END | |
AS repeat_order_revenue, | |
CASE | |
WHEN event_type IN ('purchase' ) THEN 1 | |
ELSE | |
0 | |
END | |
AS count_conversions, | |
CASE | |
WHEN event_type='purchase' AND event_id = first_order_event_id THEN 1 | |
ELSE | |
0 | |
END | |
AS count_first_order_conversions, | |
CASE | |
WHEN event_type='purchase' AND event_id != first_order_event_id THEN 1 | |
ELSE | |
0 | |
END | |
AS count_repeat_order_conversions, | |
CASE | |
WHEN event_type = 'purchase' THEN 1 | |
ELSE | |
0 | |
END | |
AS count_order_conversions, | |
CASE | |
WHEN event_type='add_payment_info' AND event_id = first_registration_event_id THEN 1 | |
ELSE | |
0 | |
END | |
AS count_registration_conversions, | |
event_ts AS converted_ts | |
FROM | |
events_filtered e ), | |
converting_sessions_deduped AS ( | |
SELECT | |
session_id AS session_id, | |
MAX(blended_user_id) AS blended_user_id, | |
/* note that because a session could in-theory contain account opening, first order and multiple repeat order events (conversions) within the same session, we have to aggregate the value of those conversions when working at the session level */ | |
SUM(first_order_revenue) AS first_order_revenue, | |
SUM(repeat_order_revenue) AS repeat_order_revenue, | |
SUM(count_first_order_conversions) AS count_first_order_conversions, | |
SUM(count_repeat_order_conversions) AS count_repeat_order_conversions, | |
SUM(count_order_conversions) AS count_order_conversions, | |
SUM(count_registration_conversions) AS count_registration_conversions, | |
SUM(count_registration_conversions) + SUM(count_first_order_conversions) + SUM(count_repeat_order_conversions) AS count_conversions, | |
MAX(converted_ts) AS converted_ts, | |
MIN(converted_ts) AS min_converted_ts | |
FROM | |
converting_events | |
GROUP BY | |
1 ), | |
/* Combine (join) those conversion sessions with all of the sessions that led-up to those conversions */ | |
touchpoint_and_converting_sessions_labelled AS ( | |
SELECT | |
* | |
FROM ( | |
SELECT | |
*, | |
FIRST_VALUE(converted_ts IGNORE NULLS) OVER (PARTITION BY blended_user_id ORDER BY session_start_ts ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS conversion_cycle_conversion_ts, | |
-- used later on to calculate days to conversion | |
ROW_NUMBER() OVER (PARTITION BY blended_user_id ORDER BY session_start_ts) AS session_seq | |
FROM ( | |
SELECT | |
s.blended_user_id AS blended_user_id, | |
s.session_id AS session_id, | |
s.session_start_ts AS session_start_ts, | |
s.session_end_ts AS session_end_ts, | |
c.converted_ts AS converted_ts, | |
c.min_converted_ts AS min_converted_ts, | |
COALESCE(SUM(c.count_conversions),0) AS count_conversions, | |
COALESCE(SUM(c.count_order_conversions),0) AS count_order_conversions, | |
COALESCE(SUM(c.count_first_order_conversions),0) AS count_first_order_conversions, | |
COALESCE(SUM(c.count_repeat_order_conversions),0) AS count_repeat_order_conversions, | |
COALESCE(SUM(c.count_registration_conversions),0) AS count_registration_conversions, | |
COALESCE(CASE | |
WHEN c.count_conversions >0 THEN TRUE | |
ELSE | |
FALSE | |
END | |
,FALSE) AS conversion_session, | |
COALESCE(CASE | |
WHEN c.count_conversions >0 THEN 1 | |
ELSE | |
0 | |
END | |
,0) AS conversion_event, | |
--used when calculating the conversion cycle number | |
COALESCE(CASE | |
WHEN c.count_order_conversions>0 THEN 1 | |
ELSE | |
0 | |
END | |
,0) AS order_conversion_event, | |
--used when calculating the order converion cycle number | |
COALESCE(CASE | |
WHEN c.count_registration_conversions>0 THEN 1 | |
ELSE | |
0 | |
END | |
,0) AS registration_conversion_event, | |
-- see above | |
COALESCE(CASE | |
WHEN c.count_first_order_conversions>0 THEN 1 | |
ELSE | |
0 | |
END | |
,0) AS first_order_conversion_event, | |
-- ditto | |
COALESCE(CASE | |
WHEN c.count_repeat_order_conversions>0 THEN 1 | |
ELSE | |
0 | |
END | |
,0) AS repeat_order_conversion_event, | |
-- ditto | |
utm_source AS utm_source, | |
CAST(NULL AS string) AS utm_content, | |
utm_medium AS utm_medium, | |
utm_campaign AS utm_campaign, | |
referrer_host AS referrer_host, | |
channel AS channel, | |
CASE | |
WHEN LOWER(utm_source) IN ('(direct)', '(data deleted)', '<other>') THEN FALSE | |
ELSE | |
TRUE | |
END | |
AS is_non_direct_channel, | |
CASE | |
WHEN LOWER(utm_medium) LIKE '%paid%' THEN TRUE | |
ELSE | |
FALSE | |
END | |
AS is_paid_channel, | |
events AS events, | |
c.first_order_revenue, | |
c.repeat_order_revenue, | |
city, | |
continent, | |
country, | |
region | |
FROM | |
user_stitched_sessions s | |
LEFT JOIN | |
converting_sessions_deduped c | |
ON | |
s.session_id = c.session_id | |
GROUP BY | |
1, | |
2, | |
3, | |
4, | |
5, | |
6, | |
12, | |
13, | |
14, | |
15, | |
16, | |
17, | |
18, | |
19, | |
20, | |
21, | |
22, | |
23, | |
24, | |
25, | |
26, | |
27, | |
28, | |
29, | |
30, | |
31, | |
32) ) | |
WHERE | |
conversion_cycle_conversion_ts >= session_start_ts ), | |
/* This is a multi-cycle attribution model which means that we attribute the value of each order placed by a user to the sessions that led to that order, starting with the session after the last order | |
We therefore need to split each users' sessions into "conversion cycles", the sessions leading-up to and potentially including the session in which the conversion happened. This next CTE starts this | |
process of calculating those conversion cycles by first calculating, for each session for each user, how many conversions of each type have been recorded for that user at the time the session started | |
by summing the number of conversions recorded in the rows (sessions) up to and including the current row (session) */ | |
touchpoint_and_converting_sessions_labelled_with_conversion_number AS ( | |
SELECT | |
*, | |
SUM(conversion_event) OVER (PARTITION BY blended_user_id ORDER BY session_start_ts ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW ) AS user_total_conversions, | |
SUM(count_order_conversions) OVER (PARTITION BY blended_user_id ORDER BY session_start_ts ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW ) AS user_total_order_conversions, | |
SUM(count_registration_conversions) OVER (PARTITION BY blended_user_id ORDER BY session_start_ts ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW ) AS user_total_registration_conversions, | |
SUM(count_first_order_conversions) OVER (PARTITION BY blended_user_id ORDER BY session_start_ts ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW ) AS user_total_first_order_conversions, | |
SUM(count_repeat_order_conversions) OVER (PARTITION BY blended_user_id ORDER BY session_start_ts ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW ) AS user_total_repeat_order_conversions | |
FROM | |
touchpoint_and_converting_sessions_labelled ), | |
/* A conversion cycle is defined as all sessions (rows) leading up-to and including the conversion session (conversion cycle #1), with the conversion cycle then incrementing to conversion cycle #2 for the rows leading up to the next conversion, then we're on to conversion cycle #3, and so on. | |
There can only be one conversion cycle for user registration conversions, and the same is true for first order conversions. Repeat order conversions start at conversion cycle #1 (if the user has made their second order) and then increment to #2 for the users' third order, etc. This block of code calculates which conversion cycle each row (session) is within for each of the conversion cycle types */ | |
touchpoint_and_converting_sessions_labelled_with_conversion_number_and_conversion_cycles AS ( | |
SELECT | |
*, | |
CASE | |
WHEN registration_conversion_event = 0 THEN MAX(COALESCE(user_total_registration_conversions,0)) OVER (PARTITION BY blended_user_id ORDER BY session_start_ts ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW ) + 1 | |
ELSE | |
MAX(user_total_registration_conversions) OVER (PARTITION BY blended_user_id ORDER BY session_start_ts ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW ) | |
END | |
AS user_registration_conversion_cycle, | |
CASE | |
WHEN conversion_event = 0 THEN MAX(COALESCE(user_total_conversions,0)) OVER (PARTITION BY blended_user_id ORDER BY session_start_ts ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW ) + 1 | |
ELSE | |
MAX(user_total_conversions) OVER (PARTITION BY blended_user_id ORDER BY session_start_ts ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW ) | |
END | |
AS user_conversion_cycle, | |
CASE | |
WHEN first_order_conversion_event = 0 THEN MAX(COALESCE(user_total_first_order_conversions,0)) OVER (PARTITION BY blended_user_id ORDER BY session_start_ts ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW ) + 1 | |
ELSE | |
MAX(user_total_first_order_conversions) OVER (PARTITION BY blended_user_id ORDER BY session_start_ts ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW ) | |
END | |
AS user_first_order_conversion_cycle, | |
CASE | |
WHEN repeat_order_conversion_event = 0 THEN MAX(COALESCE(user_total_repeat_order_conversions,0)) OVER (PARTITION BY blended_user_id ORDER BY session_start_ts ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW ) + 1 | |
ELSE | |
MAX(user_total_repeat_order_conversions) OVER (PARTITION BY blended_user_id ORDER BY session_start_ts ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW ) | |
END | |
AS user_repeat_order_conversion_cycle | |
FROM | |
touchpoint_and_converting_sessions_labelled_with_conversion_number ), | |
/* As we only consider rows (sessions) within a certain number of days before each conversion happened (the "lookback window") we first have to calculate a day number for each row. | |
We do this by reference to a set starting date, arbitrarily chosen (2018-01-01) with the assumption that it's earlier than any conversion we need to attribute value for; | |
we could also achieve the same result (turning date into a number) via the Unix date or a similar date>integer transformation */ | |
touchpoint_and_converting_sessions_labelled_with_conversion_number_and_conversion_cycles_and_day_number AS ( | |
SELECT | |
*, | |
(DATE_DIFF(DATE(session_start_ts),DATE('2018-01-01'),DAY)) AS session_day_number | |
FROM | |
touchpoint_and_converting_sessions_labelled_with_conversion_number_and_conversion_cycles ), | |
/* now we calculate how many days before the next conversion each row (session) is, | |
and then determine, based on a variable set in the dbt_project.yml file, whether the row (session) is within | |
the regular attribution, and time decay attribution, look-back windows */ | |
days_to_each_conversion AS ( | |
SELECT | |
*, | |
MAX(session_day_number) OVER (PARTITION BY blended_user_id, user_conversion_cycle) - session_day_number AS days_before_conversion, | |
(MAX(session_day_number) OVER (PARTITION BY blended_user_id, user_conversion_cycle) - session_day_number )<= 30 AS is_within_attribution_lookback_window, | |
(MAX(session_day_number) OVER (PARTITION BY blended_user_id, user_conversion_cycle) - session_day_number ) <= 7 AS is_within_attribution_time_decay_days_window | |
FROM | |
touchpoint_and_converting_sessions_labelled_with_conversion_number_and_conversion_cycles_and_day_number ), | |
/* Time-decay attribution is a multi-touch attribution model that gives some credit to all the channels that led to your customer converting, | |
with that amount of credit being less (decaying) the further back in time the channel was interacted with. | |
This CTE calculates the various numbers we need as inputs to the time decay calculation */ | |
add_time_decay_score AS ( | |
SELECT | |
*, | |
IF | |
(is_within_attribution_time_decay_days_window, POW(2,days_before_conversion-1)/NULLIF(7,0),NULL) AS time_decay_score, | |
IF | |
(conversion_session,1,POW(2, (days_before_conversion - 1))) AS weighting, | |
IF | |
(conversion_session,1,(COUNT(CASE | |
WHEN NOT conversion_session OR TRUE THEN session_id | |
END | |
) OVER (PARTITION BY blended_user_id, DATE_TRUNC(CAST(session_start_ts AS date),DAY)))) AS sessions_within_day_to_conversion, | |
IF | |
(conversion_session,1,safe_divide (POW(2, (days_before_conversion - 1)), | |
COUNT(CASE | |
WHEN NOT conversion_session OR TRUE THEN session_id | |
END | |
) OVER (PARTITION BY blended_user_id, DATE_TRUNC(CAST(session_start_ts AS date),DAY)))) AS weighting_split_by_days_sessions | |
FROM | |
days_to_each_conversion ), | |
/* Because time-decay attribution adjusts the attributed value of the conversion by looking at the DAYS before the conversion happened and not the rows (sessions) before the conversion, | |
the time decay attribution score calculated in the CTE above will end-up assigning the value of that days attributed conversions to all of the rows (sessions) recorded for that day, | |
of which there may well be more than one for each day. So we then split the value of that day's conversion across the sessions within that day, equally, so we don't end-up over-counting time decay conversions */ | |
split_time_decay_score_across_days_sessions AS ( | |
SELECT | |
*, | |
time_decay_score/NULLIF(sessions_within_day_to_conversion,0) AS apportioned_time_decay_score | |
FROM | |
add_time_decay_score ), | |
/* Calculate the first/last non-direct/paid sessions in each conversion cycle | |
and all conversion cycles that include non-direct and paid channel sessions. | |
These flags are used in the actual session attribution calculations in the next CTE */ /* "and not True" excludes the session from any attribution of the conversion. | |
Option is set via the "attribution_include_conversion_session: true" variable definition in the dbt_project.yml config file. | |
Default value of "true" has the effect of including the actual session that the conversion happened in within the set of sessions eligable for attribution. | |
Reason for including this option is because for sessions generated by dbt (for the custom transactions and account openings that couldn't be linked to a session) | |
you might want to exclude these from having conversions all or partly attributed to them as they couldn't possibly have marketing channel information recorded for them. | |
However the option to exclude them (setting this variable to "false") has not been enabled, so they are included in-scope for attribution (unless subsequently it's set to "false") */ | |
attrib_calc_flags AS ( | |
SELECT | |
*, | |
IF | |
(FIRST_VALUE(CASE | |
WHEN is_within_attribution_lookback_window AND is_non_direct_channel = TRUE THEN session_id | |
END | |
IGNORE NULLS) OVER (PARTITION BY blended_user_id, user_conversion_cycle ORDER BY session_start_ts ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) = session_id,TRUE,FALSE) AS is_first_non_direct_channel_in_conversion_cycle, | |
IF | |
(LAST_VALUE(CASE | |
WHEN is_within_attribution_lookback_window AND is_non_direct_channel = TRUE THEN session_id | |
END | |
IGNORE NULLS) OVER (PARTITION BY blended_user_id, user_conversion_cycle ORDER BY session_start_ts ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)=session_id,TRUE,FALSE) AS is_last_non_direct_channel_in_conversion_cycle, | |
IF | |
(SUM(CASE | |
WHEN is_within_attribution_lookback_window AND is_non_direct_channel = TRUE THEN 1 | |
END | |
) OVER (PARTITION BY blended_user_id, user_conversion_cycle ORDER BY session_start_ts ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)>0,TRUE,FALSE) AS is_conversion_cycle_with_non_direct, | |
IF | |
(FIRST_VALUE(CASE | |
WHEN is_within_attribution_lookback_window AND is_paid_channel = TRUE THEN session_id | |
END | |
IGNORE NULLS) OVER (PARTITION BY blended_user_id, user_conversion_cycle ORDER BY session_start_ts ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)=session_id,TRUE,FALSE) AS is_first_paid_channel_in_conversion_cycle, | |
IF | |
(LAST_VALUE(CASE | |
WHEN is_within_attribution_lookback_window AND is_paid_channel = TRUE THEN session_id | |
END | |
IGNORE NULLS) OVER (PARTITION BY blended_user_id, user_conversion_cycle ORDER BY session_start_ts ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)=session_id,TRUE,FALSE) AS is_last_paid_channel_in_conversion_cycle, | |
IF | |
(SUM(CASE | |
WHEN is_within_attribution_lookback_window AND is_paid_channel = TRUE THEN 1 | |
END | |
) OVER (PARTITION BY blended_user_id, user_conversion_cycle ORDER BY session_start_ts ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)>0,TRUE,FALSE) AS is_conversion_cycle_with_paid | |
FROM | |
split_time_decay_score_across_days_sessions ), | |
session_attrib_pct AS ( | |
SELECT | |
*, | |
IF | |
(conversion_session | |
AND NOT TRUE,0, | |
CASE | |
WHEN session_id = LAST_VALUE( IF (is_within_attribution_lookback_window,session_id,NULL) IGNORE NULLS) OVER (PARTITION BY blended_user_id, user_conversion_cycle ORDER BY session_start_ts ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) THEN 1 | |
ELSE | |
0 | |
END | |
) AS last_click_attrib_pct, | |
IF | |
(conversion_session | |
AND NOT TRUE,0, | |
CASE | |
WHEN is_last_non_direct_channel_in_conversion_cycle THEN 1 -- if the session is the last qualifying session in the conversion cycle, i.e. last non-direct session, then allocate 100% of conversion to it | |
WHEN | |
IF | |
(NOT is_conversion_cycle_with_non_direct | |
AND session_id = LAST_VALUE( | |
IF | |
(is_within_attribution_lookback_window,session_id,NULL) IGNORE NULLS) OVER (PARTITION BY blended_user_id, user_conversion_cycle ORDER BY session_start_ts ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING),TRUE,FALSE) = TRUE THEN 1 -- else if there are no non-direct channel sessions in the conversion cycle AND this the last session in that conversion cycle, allocate 100% of the conversion to it | |
ELSE | |
0 -- else allocate 0% | |
END | |
) AS last_non_direct_click_attrib_pct, | |
IF | |
(conversion_session | |
AND NOT TRUE,0, | |
CASE | |
WHEN is_last_paid_channel_in_conversion_cycle THEN 1 -- if the session is the last qualifying session in the conversion cycle, i.e. last paid session, then allocate 100% of conversion to it | |
WHEN | |
IF | |
(NOT is_conversion_cycle_with_paid | |
AND session_id = LAST_VALUE( | |
IF | |
(is_within_attribution_lookback_window,session_id,NULL) IGNORE NULLS) OVER (PARTITION BY blended_user_id, user_conversion_cycle ORDER BY session_start_ts ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING),TRUE,FALSE) = TRUE THEN 1 -- else if there are no paid channel sessions in the conversion cycle AND this the last session in that conversion cycle, allocate 100% of the conversion to it | |
ELSE | |
0 -- else allocate 0% | |
END | |
) AS last_paid_click_attrib_pct, | |
IF | |
(conversion_session | |
AND NOT TRUE,0, | |
CASE | |
WHEN session_id = FIRST_VALUE( IF (is_within_attribution_lookback_window,session_id,NULL) IGNORE NULLS) OVER (PARTITION BY blended_user_id, user_conversion_cycle ORDER BY session_start_ts ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) THEN 1 | |
ELSE | |
0 | |
END | |
) AS first_click_attrib_pct, | |
IF | |
(conversion_session | |
AND NOT TRUE,0, | |
CASE | |
WHEN is_first_non_direct_channel_in_conversion_cycle THEN 1 -- if the session is the first qualifying session in the conversion cycle, i.e. first non-direct session, then allocate 100% of conversion to it | |
WHEN | |
IF | |
(NOT is_conversion_cycle_with_non_direct | |
AND session_id = FIRST_VALUE( | |
IF | |
(is_within_attribution_lookback_window,session_id,NULL) IGNORE NULLS) OVER (PARTITION BY blended_user_id, user_conversion_cycle ORDER BY session_start_ts ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING),TRUE,FALSE) = TRUE THEN 1 -- else if there are no non-direct channel sessions in the conversion cycle AND this the first session in that conversion cycle, allocate 100% of the conversion to it | |
ELSE | |
0 -- else allocate 0% | |
END | |
) AS first_non_direct_click_attrib_pct, | |
IF | |
(conversion_session | |
AND NOT TRUE,0, | |
CASE | |
WHEN is_first_paid_channel_in_conversion_cycle THEN 1 -- if the session is the first qualifying session in the conversion cycle, i.e. first paid session, then allocate 100% of conversion to it | |
WHEN | |
IF | |
(NOT is_conversion_cycle_with_paid | |
AND session_id = FIRST_VALUE( | |
IF | |
(is_within_attribution_lookback_window,session_id,NULL) IGNORE NULLS) OVER (PARTITION BY blended_user_id, user_conversion_cycle ORDER BY session_start_ts ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING),TRUE,FALSE) = TRUE THEN 1 -- else if there are no paid channel sessions in the conversion cycle AND this the first session in that conversion cycle, allocate 100% of the conversion to it | |
ELSE | |
0 -- else allocate 0% | |
END | |
) AS first_paid_click_attrib_pct, | |
IF | |
(conversion_session | |
AND NOT TRUE,0, | |
IF | |
(is_within_attribution_lookback_window,(safe_divide (1, | |
(COUNT( | |
IF | |
(is_within_attribution_lookback_window,session_id,NULL)) OVER (PARTITION BY blended_user_id, user_conversion_cycle ORDER BY session_start_ts ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) + 0))),0) ) AS even_click_attrib_pct, | |
IF | |
(conversion_session | |
AND NOT TRUE,0,CASE | |
WHEN is_within_attribution_time_decay_days_window THEN apportioned_time_decay_score / NULLIF((SUM(apportioned_time_decay_score) OVER (PARTITION BY blended_user_id, user_conversion_cycle)),0) | |
END | |
) AS time_decay_attrib_pct | |
FROM | |
attrib_calc_flags ), | |
/* Now calculate the actual account opening, first order, repeat order and revenue numbers based on the attribution percentages calculated for the session */ | |
/* Max() aggregations are used to find the conversion value that each session-level percentage attribution is then applied to */ | |
final AS ( | |
SELECT | |
'Last Click' AS MODEL, | |
(MAX(count_registration_conversions) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* last_click_attrib_pct) AS user_registration_conversions, | |
(MAX(count_first_order_conversions) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* last_click_attrib_pct) AS first_order_conversions, | |
(MAX(count_repeat_order_conversions) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* last_click_attrib_pct) AS repeat_order_conversions, | |
(MAX(first_order_revenue) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* last_click_attrib_pct) AS first_order_revenue, | |
(MAX(repeat_order_revenue) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* last_click_attrib_pct) AS repeat_order_revenue, | |
blended_user_id, | |
session_id, | |
session_start_ts, | |
session_end_ts, | |
session_seq, | |
user_conversion_cycle, | |
is_first_non_direct_channel_in_conversion_cycle, | |
is_last_non_direct_channel_in_conversion_cycle, | |
is_conversion_cycle_with_non_direct, | |
is_first_paid_channel_in_conversion_cycle, | |
is_last_paid_channel_in_conversion_cycle, | |
is_conversion_cycle_with_paid, | |
is_non_direct_channel, | |
is_paid_channel, | |
conversion_session AS is_conversion_session, | |
utm_source, | |
utm_medium, | |
utm_campaign, | |
referrer_host, | |
channel, | |
city, | |
continent, | |
country, | |
region | |
FROM | |
session_attrib_pct a | |
UNION ALL | |
SELECT | |
'First Click' AS MODEL, | |
(MAX(count_registration_conversions) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* first_click_attrib_pct) AS user_registration_conversions, | |
(MAX(count_first_order_conversions) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* first_click_attrib_pct) AS first_order_conversions, | |
(MAX(count_repeat_order_conversions) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* first_click_attrib_pct) AS repeat_order_conversions, | |
(MAX(first_order_revenue) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* first_click_attrib_pct) AS first_order_revenue, | |
(MAX(repeat_order_revenue) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* first_click_attrib_pct) AS repeat_order_revenue, | |
blended_user_id, | |
session_id, | |
session_start_ts, | |
session_end_ts, | |
session_seq, | |
user_conversion_cycle, | |
is_first_non_direct_channel_in_conversion_cycle, | |
is_last_non_direct_channel_in_conversion_cycle, | |
is_conversion_cycle_with_non_direct, | |
is_first_paid_channel_in_conversion_cycle, | |
is_last_paid_channel_in_conversion_cycle, | |
is_conversion_cycle_with_paid, | |
is_non_direct_channel, | |
is_paid_channel, | |
conversion_session AS is_conversion_session, | |
utm_source, | |
utm_medium, | |
utm_campaign, | |
referrer_host, | |
channel, | |
city, | |
continent, | |
country, | |
region | |
FROM | |
session_attrib_pct a | |
UNION ALL | |
SELECT | |
'Last Non-Direct Click' AS MODEL, | |
(MAX(count_registration_conversions) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* last_non_direct_click_attrib_pct) AS user_registration_conversions, | |
(MAX(count_first_order_conversions) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* last_non_direct_click_attrib_pct) AS first_order_conversions, | |
(MAX(count_repeat_order_conversions) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* last_non_direct_click_attrib_pct) AS repeat_order_conversions, | |
(MAX(first_order_revenue) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* last_non_direct_click_attrib_pct) AS first_order_revenue, | |
(MAX(repeat_order_revenue) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* last_non_direct_click_attrib_pct) AS repeat_order_revenue, | |
blended_user_id, | |
session_id, | |
session_start_ts, | |
session_end_ts, | |
session_seq, | |
user_conversion_cycle, | |
is_first_non_direct_channel_in_conversion_cycle, | |
is_last_non_direct_channel_in_conversion_cycle, | |
is_conversion_cycle_with_non_direct, | |
is_first_paid_channel_in_conversion_cycle, | |
is_last_paid_channel_in_conversion_cycle, | |
is_conversion_cycle_with_paid, | |
is_non_direct_channel, | |
is_paid_channel, | |
conversion_session AS is_conversion_session, | |
utm_source, | |
utm_medium, | |
utm_campaign, | |
referrer_host, | |
channel, | |
city, | |
continent, | |
country, | |
region | |
FROM | |
session_attrib_pct a | |
UNION ALL | |
SELECT | |
'First Paid Click' AS MODEL, | |
(MAX(count_registration_conversions) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* first_paid_click_attrib_pct) AS user_registration_conversions, | |
(MAX(count_first_order_conversions) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* first_paid_click_attrib_pct) AS first_order_conversions, | |
(MAX(count_repeat_order_conversions) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* first_paid_click_attrib_pct) AS repeat_order_conversions, | |
(MAX(first_order_revenue) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* first_paid_click_attrib_pct) AS first_order_revenue, | |
(MAX(repeat_order_revenue) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* first_paid_click_attrib_pct) AS repeat_order_revenue, | |
blended_user_id, | |
session_id, | |
session_start_ts, | |
session_end_ts, | |
session_seq, | |
user_conversion_cycle, | |
is_first_non_direct_channel_in_conversion_cycle, | |
is_last_non_direct_channel_in_conversion_cycle, | |
is_conversion_cycle_with_non_direct, | |
is_first_paid_channel_in_conversion_cycle, | |
is_last_paid_channel_in_conversion_cycle, | |
is_conversion_cycle_with_paid, | |
is_non_direct_channel, | |
is_paid_channel, | |
conversion_session AS is_conversion_session, | |
utm_source, | |
utm_medium, | |
utm_campaign, | |
referrer_host, | |
channel, | |
city, | |
continent, | |
country, | |
region | |
FROM | |
session_attrib_pct a | |
UNION ALL | |
SELECT | |
'Last Paid Click' AS MODEL, | |
(MAX(count_registration_conversions) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* last_paid_click_attrib_pct) AS user_registration_conversions, | |
(MAX(count_first_order_conversions) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* last_paid_click_attrib_pct) AS first_order_conversions, | |
(MAX(count_repeat_order_conversions) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* last_paid_click_attrib_pct) AS repeat_order_conversions, | |
(MAX(first_order_revenue) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* last_paid_click_attrib_pct) AS first_order_revenue, | |
(MAX(repeat_order_revenue) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* last_paid_click_attrib_pct) AS repeat_order_revenue, | |
blended_user_id, | |
session_id, | |
session_start_ts, | |
session_end_ts, | |
session_seq, | |
user_conversion_cycle, | |
is_first_non_direct_channel_in_conversion_cycle, | |
is_last_non_direct_channel_in_conversion_cycle, | |
is_conversion_cycle_with_non_direct, | |
is_first_paid_channel_in_conversion_cycle, | |
is_last_paid_channel_in_conversion_cycle, | |
is_conversion_cycle_with_paid, | |
is_non_direct_channel, | |
is_paid_channel, | |
conversion_session AS is_conversion_session, | |
utm_source, | |
utm_medium, | |
utm_campaign, | |
referrer_host, | |
channel, | |
city, | |
continent, | |
country, | |
region | |
FROM | |
session_attrib_pct a | |
UNION ALL | |
SELECT | |
'Linear' AS MODEL, | |
(MAX(count_registration_conversions) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* even_click_attrib_pct) AS user_registration_conversions, | |
(MAX(count_first_order_conversions) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* even_click_attrib_pct) AS first_order_conversions, | |
(MAX(count_repeat_order_conversions) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* even_click_attrib_pct) AS repeat_order_conversions, | |
(MAX(first_order_revenue) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* even_click_attrib_pct) AS first_order_revenue, | |
(MAX(repeat_order_revenue) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* even_click_attrib_pct) AS repeat_order_revenue, | |
blended_user_id, | |
session_id, | |
session_start_ts, | |
session_end_ts, | |
session_seq, | |
user_conversion_cycle, | |
is_first_non_direct_channel_in_conversion_cycle, | |
is_last_non_direct_channel_in_conversion_cycle, | |
is_conversion_cycle_with_non_direct, | |
is_first_paid_channel_in_conversion_cycle, | |
is_last_paid_channel_in_conversion_cycle, | |
is_conversion_cycle_with_paid, | |
is_non_direct_channel, | |
is_paid_channel, | |
conversion_session AS is_conversion_session, | |
utm_source, | |
utm_medium, | |
utm_campaign, | |
referrer_host, | |
channel, | |
city, | |
continent, | |
country, | |
region | |
FROM | |
session_attrib_pct a | |
UNION ALL | |
SELECT | |
'Time Decay' AS MODEL, | |
(MAX(count_registration_conversions) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* time_decay_attrib_pct) AS user_registration_conversions, | |
(MAX(count_first_order_conversions) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* time_decay_attrib_pct) AS first_order_conversions, | |
(MAX(count_repeat_order_conversions) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* time_decay_attrib_pct) AS repeat_order_conversions, | |
(MAX(first_order_revenue) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* time_decay_attrib_pct) AS first_order_revenue, | |
(MAX(repeat_order_revenue) OVER (PARTITION BY blended_user_id, user_conversion_cycle)* time_decay_attrib_pct) AS repeat_order_revenue, | |
blended_user_id, | |
session_id, | |
session_start_ts, | |
session_end_ts, | |
session_seq, | |
user_conversion_cycle, | |
is_first_non_direct_channel_in_conversion_cycle, | |
is_last_non_direct_channel_in_conversion_cycle, | |
is_conversion_cycle_with_non_direct, | |
is_first_paid_channel_in_conversion_cycle, | |
is_last_paid_channel_in_conversion_cycle, | |
is_conversion_cycle_with_paid, | |
is_non_direct_channel, | |
is_paid_channel, | |
conversion_session AS is_conversion_session, | |
utm_source, | |
utm_medium, | |
utm_campaign, | |
referrer_host, | |
channel, | |
city, | |
continent, | |
country, | |
region | |
FROM | |
session_attrib_pct a ), | |
pivoted AS ( | |
SELECT | |
MODEL, | |
blended_user_id, | |
session_id, | |
session_start_ts, | |
session_end_ts, | |
session_seq, | |
user_conversion_cycle, | |
is_first_non_direct_channel_in_conversion_cycle, | |
is_last_non_direct_channel_in_conversion_cycle, | |
is_conversion_cycle_with_non_direct, | |
is_first_paid_channel_in_conversion_cycle, | |
is_last_paid_channel_in_conversion_cycle, | |
is_conversion_cycle_with_paid, | |
is_non_direct_channel, | |
is_paid_channel, | |
is_conversion_session, | |
utm_source, | |
utm_medium, | |
utm_campaign, | |
referrer_host, | |
channel, | |
city, | |
continent, | |
country, | |
region, | |
user_registration_conversions, | |
first_order_conversions, | |
repeat_order_conversions, | |
first_order_revenue, | |
repeat_order_revenue | |
FROM | |
final) | |
SELECT | |
* | |
FROM | |
pivoted |
Thanks for this. One question:
shouldn't be enough to take user_pseudo_id and timestamp for the event_id?
I mean, that user in that moment will have only one event type.
CONCAT(user_pseudo_id,'-',event_name,'-',CAST(event_timestamp AS STRING)) AS event_id,
I think yes - but firstly I would check if there is only one pair of event_name-event_timestamp across one user. Maybe You can add session_id as well.
Apparently, I'm wrong with "real" data: GROUP BY on user_pseudo_id, event_name, event_timestamp gives you count(*) > 1
Thanks for sharing, I have a small question: why do you take traffic_source.source for utm_source?
As long as I know this is the first source of a user, assuming that (SELECT value.string_value FROM UNNEST(event_params) WHERE key="source") is the actual source of the session, taking into account that a session can have multiple sources.
Actually I found source data also in events_param and page_location url and in many case fields are alternatively empty. I could not find which is the difference of source at record, events_param and page_location level
Apparently, I'm wrong with "real" data: GROUP BY on user_pseudo_id, event_name, event_timestamp gives you count(*) > 1
Have You analysed more precisely these cases?
You need these:
(SELECT value.string_value FROM UNNEST(event_params) WHERE key="source") as source,
(SELECT value.string_value FROM UNNEST(event_params) WHERE key="medium") as medium,
Yeah this is event_params UTMs.
What I'm saying is that you have also UTM at record level (traffic_source, traffic_medium, traffic_name) and at page_location level (in url, e.g. &utm_source, &utm_medium). Btw they are not always present in all the three places and actually would like to understand the difference among the three.
as exception there might be case
when you have at least one NULL value (any of user_pseudo_id,event_name,event_timestamp) in your string concatenation:
CONCAT(user_pseudo_id,'-',event_name,'-',CAST(event_timestamp AS STRING)) AS event_id
this will return NULL as result for record
there is proof
check it in your bigquery:
select 1,concat(NULL,'a','b') -- NULL
union all
select 2,concat('a',NULL,'b') -- NULL
union all
select 3,concat('a','b',NULL) -- NULL
so if this not excpected
use
CONCAT( IFNULL(user_pseudo_id,'some')
,'-'
,IFNULL(event_name,'thing')
,'-'
,IFNULL(CAST(event_timestamp AS STRING),'happend')
) AS event_id
@KobozevYaroslav has a point