Last active
June 13, 2023 13:40
-
-
Save krisjan-oldekamp/b3b98ecc579fa40a8fdc48a84256a2de to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Author: Krisjan Oldekamp | |
-- https://stacktonic.com/article/enrich-a-single-customer-view-with-google-analytics-4-big-query-data | |
declare lookback_window int64 default 365; -- how many days to lookback into the ga4 dataset to calculate profiles | |
-- udf: channel grouping (you could put this in a permanent function) | |
-- also see https://stacktonic.com/article/google-analytics-4-and-big-query-create-custom-channel-groupings-in-a-reusable-sql-function | |
create temporary function channel_grouping(tsource string, medium string, campaign string) as ( | |
case | |
when (tsource = '(direct)' or tsource is null) | |
and (regexp_contains(medium, r'^(\(not set\)|\(none\))$') or medium is null) | |
then 'direct' | |
when regexp_contains(medium, r'^(social|social_advertising|social-advertising|social_network|social-network|social_media|social-media)$') | |
then 'social' | |
when regexp_contains(medium, r'^(email|mail)$') | |
then 'email' | |
when regexp_contains(medium, r'^(affiliate|affiliates)$') | |
then 'affiliate' | |
when regexp_contains(medium, r'^(cpc|ppc|paidsearch)$') | |
then 'search_paid' | |
when regexp_contains(medium, r'^(display|cpm|banner)$') | |
then 'display' | |
when medium = 'organic' | |
then 'search_organic' | |
when medium = 'referral' | |
then 'referral' | |
else '(other)' | |
end | |
); | |
-- udf: most frequent value from array | |
create temp function most_freq(arr any type) as (( | |
select x from unnest(arr) x group by x order by count(1) desc limit 1 | |
)); | |
-- udf: get first channel within session | |
create temp function first_channel_session(arr any type) as (( | |
select | |
channel_grouping(x.source, x.medium, x.campaign) | |
from unnest(arr) x | |
where x.ignore_referrer is null | |
order by x.event_timestamp asc limit 1 | |
)); | |
with | |
-- this in-between table is based mapping table described in this article (you can leave out this part if you want to group on device-id only) | |
-- https://stacktonic.com/article/create-a-user-mapping-table-based-on-the-google-analytics-4-big-query-dataset) | |
-- only select ga_client_id (`user_pseudo_id`) with 3 of less customer-ids attached. | |
-- when multiple customer-ids, select most recent one. | |
customer_id_mapping as ( | |
select | |
ga_client_id, | |
customer_ids, | |
(select x.customer_id from unnest(customer_ids) x order by x.timestamp desc limit 1) as customer_id | |
from ( | |
select | |
ga_id.id as ga_client_id, | |
count(customer_id) as customer_ids_count, | |
array_agg(struct(customer_id, ga_id.timestamp)) as customer_ids, | |
from | |
`<your-project>.<your-dataset>.<your-mapping-table>`, | |
unnest(ga_client_id) as ga_id | |
group by | |
ga_client_id | |
) | |
where | |
customer_ids_count <= 3 | |
), | |
-- aggegrate event level data to session level data and attach customer-id from mapping table | |
sessions as ( | |
select | |
user_pseudo_id as ga_client_id, | |
max(m.customer_id) as customer_id, -- logic to select only one customer-id per user_pseudo_id can be found in `customer_id_mapping` | |
concat(user_pseudo_id,'.',(select cast(value.int_value as string) from unnest(event_params) where key = 'ga_session_id')) as session_id, -- combine user_pseudo_id and session_id for a unique session-id | |
timestamp_micros(min(event_timestamp)) as session_start, | |
max((select value.string_value from unnest(event_params) where key = 'session_engaged')) as session_engaged, | |
max((select value.int_value from unnest(event_params) where key = 'engagement_time_msec'))/1000 as engagement_time, | |
max(lower(traffic_source.source)) as traffic_source_first, | |
max(lower(traffic_source.medium)) as traffic_medium_first, | |
min(timestamp_micros(user_first_touch_timestamp)) as session_start_first_touch, | |
max(device.category) as device, | |
max(device.operating_system) as os, | |
max(device.web_info.browser) as browser, | |
max(geo.country) as country, | |
max(geo.city) as city, | |
ifnull(first_channel_session( | |
array_agg( | |
if(event_name in('page_view','user_engagement','scroll'), struct( | |
event_timestamp, | |
lower((select value.string_value from unnest(event_params) where key = 'source')) as source, | |
lower((select value.string_value from unnest(event_params) where key = 'medium')) as medium, | |
lower((select value.string_value from unnest(event_params) where key = 'campaign')) as campaign, | |
(select value.int_value from unnest(event_params) where key = 'entrances') as is_entrance, | |
(select value.int_value from unnest(event_params) where key = 'ignore_referrer') as ignore_referrer | |
), null) | |
ignore nulls order by event_timestamp asc) | |
), 'direct') as first_channel_in_session, | |
--countif(event_name = '<name-of-some-other-conversion-event>') as some_conversion, | |
if(countif(event_name = 'login') > 0,1,0) as session_login, | |
countif(event_name = 'purchase') as transactions, | |
sum(ecommerce.purchase_revenue) as transaction_value, | |
array_agg( | |
if(event_name = 'view_item' or event_name = 'add_to_cart', struct( | |
event_timestamp, | |
event_name, | |
(select item_id from unnest(items) limit 1) as item_id, | |
(select item_category from unnest(items) limit 1) as item_category | |
), null) | |
ignore nulls order by event_timestamp desc limit 100) as item_interactions | |
from | |
`<your-project>.analytics_<your-ga4-property-id>.events_*` as s | |
left join customer_id_mapping as m | |
on m.ga_client_id = s.user_pseudo_id | |
where | |
_table_suffix between format_date('%Y%m%d',date_sub(current_date(), interval lookback_window day)) | |
and format_date('%Y%m%d',date_sub(current_date(), interval 1 day)) | |
group by | |
user_pseudo_id, | |
session_id | |
) | |
-- aggegrate to user level | |
select | |
* except (item_interactions), | |
array_to_string( | |
array(select x.item_id from unnest(item_interactions) x where x.event_name = 'view_item' group by x.item_id order by max(x.event_timestamp) desc limit 5), | |
'|' | |
) as last_viewed_items, | |
array_to_string( | |
array(select x.item_id from unnest(item_interactions) x where x.event_name = 'view_item' group by x.item_id order by count(1) desc limit 5), | |
'|' | |
) as most_viewed_items, | |
array_to_string( | |
array(select x.item_category from unnest(item_interactions) x where x.event_name = 'view_item' group by x.item_category order by count(1) desc limit 5), | |
'|' | |
) as most_viewed_categories | |
from ( | |
select | |
--ga_client_id, -- uncomment when grouping on device-id instead of customer-id | |
customer_id, -- remove when grouping on device-id | |
min(datetime(session_start_first_touch)) as first_touch, | |
min(datetime(session_start)) as first_session, | |
max(datetime(session_start)) as last_session, | |
max(datetime(if(transactions > 0, session_start, null))) as last_transaction, | |
count(session_id) as sessions, | |
countif(date_diff(current_date, datetime(session_start), day) between 0 and 7) as sessions_1d_7d, | |
countif(date_diff(current_date, datetime(session_start), day) between 8 and 30) as sessions_7d_30d, | |
countif(date_diff(current_date, datetime(session_start), day) between 31 and 90) as sessions_30d_90d, | |
countif(session_engaged = '1') as sessions_engaged, | |
safe_divide(sum(engagement_time), countif(session_engaged = '1')) as engagement_time_avg, | |
sum(session_login) as logins, | |
sum(transactions) as total_transactions, | |
ifnull(sum(transaction_value),0) as total_transaction_value, | |
count(distinct device) as devices, | |
count(distinct first_channel_in_session) as channels, | |
date_diff(current_date(),cast((max(session_start)) as date),day) as days_since_last_session, | |
date_diff(current_date(),cast((max(if(transactions > 0, session_start, null))) as date),day) as days_since_last_transaction, | |
string_agg(device order by session_start asc limit 1) as device_first, | |
string_agg(device order by session_start desc limit 1) as device_last, | |
most_freq(array_agg(device)) as device_most_freq, | |
string_agg(os order by session_start desc limit 1) as os_last, | |
most_freq(array_agg(os)) as os_most_freq, | |
string_agg(browser order by session_start desc limit 1) as browser_last, | |
most_freq(array_agg(browser)) as browser_most_freq, | |
channel_grouping(min(traffic_source_first), min(traffic_medium_first), null) as channel_acquisition, | |
string_agg(first_channel_in_session order by session_start asc limit 1) as channel_first, | |
string_agg(first_channel_in_session order by session_start desc limit 1) as channel_last, | |
most_freq(array_agg(first_channel_in_session ignore nulls)) as channel_most_freq, | |
array_concat_agg(item_interactions limit 200) as item_interactions | |
from | |
sessions | |
where -- remove when grouping on device-id | |
customer_id is not null -- remove when grouping on device-id | |
group by | |
customer_id -- remove when grouping on device-id | |
--ga_client_id --uncomment when grouping on device-id instead of customer-id | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment