Last active
June 12, 2024 13:12
-
-
Save krisjan-oldekamp/0a1a1c0085e6e6f68306f184ed1af4a1 to your computer and use it in GitHub Desktop.
How to create a user mapping table (or Identity Graph) based on all the available user identifiers in the Google Analytics 4 BigQuery exports (like device-IDs or customer-IDs). Full article on stacktonic.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Author: Krisjan Oldekamp | |
-- https://stacktonic.com/article/create-a-user-mapping-table-based-on-the-google-analytics-4-big-query-dataset | |
declare lookback_window int64 default 90; -- how many days to lookback into the dataset to search for ids (compared to today) | |
-- udf: deduplicate array of struct | |
create temp function dedup(arr any type) as (( | |
select | |
array_agg(t) | |
from ( | |
select max(a.timestamp) as timestamp, a.id from unnest(arr) a group by a.id order by timestamp desc limit 100 | |
) t | |
)); | |
with ga_user_ids as ( | |
-- select all user-ids from the ga4 dataset in the specified time period | |
select | |
* | |
from ( | |
select | |
user_pseudo_id as ga_client_id, -- device-id | |
user_id as customer_id, -- custom defined user-id (e.g. customer-id) | |
(select value.string_value from unnest(user_properties) where key = 'relay_id') as customer_id_secondary, -- secondary custom user-id | |
(select value.string_value from unnest(event_params) where event_name = 'page_view' and key = 'gclid') as gclid, -- click-id | |
ecommerce.transaction_id as transaction_id, -- order-ids | |
max(event_timestamp) as event_timestamp -- timestamps | |
from `<your-project>.analytics_<your-dataset>.events_*` | |
where | |
_table_suffix between format_date('%Y%m%d',date_sub(current_date(), interval lookback_window day)) | |
and format_date('%Y%m%d',date_sub(current_date(), interval 1 day)) | |
group by | |
1,2,3,4,5 | |
) | |
-- filter out users without any additional user-ids besides ga_client_id | |
where | |
customer_id is not null | |
or customer_id_secondary is not null | |
or gclid is not null | |
or transaction_id is not null | |
) | |
-- select all identifiers and group on customer-id. | |
select | |
customer_id, | |
max(timestamp_micros(event_timestamp)) as timestamp_last_visit, | |
-- aggegrate customer-ids to an array of structs | |
dedup(array_agg( | |
if(ga_client_id is not null, | |
struct(timestamp_micros(event_timestamp) as timestamp, ga_client_id as id), | |
null | |
) ignore nulls order by event_timestamp desc | |
)) as ga_client_id, | |
-- aggegrate secondary customer-ids to an array of structs | |
dedup(array_agg( | |
if(customer_id_secondary is not null, | |
struct(timestamp_micros(event_timestamp) as timestamp, customer_id_secondary as id), | |
null | |
)ignore nulls order by event_timestamp desc | |
)) as customer_id_secondary, | |
-- aggegrate gclids to an array of structs | |
dedup(array_agg( | |
if(gclid is not null, | |
struct(timestamp_micros(event_timestamp) as timestamp, gclid as id), | |
null | |
) ignore nulls order by event_timestamp desc | |
)) as gclid, | |
-- aggegrate transaction-ids to an array of structs | |
dedup(array_agg( | |
if(transaction_id is not null, | |
struct(timestamp_micros(event_timestamp) as timestamp, transaction_id as id), | |
null | |
) ignore nulls order by event_timestamp desc | |
)) as transaction_id, | |
from ga_user_ids | |
where | |
customer_id is not null | |
group by 1 |
I think the last few lines should read:
FROM ga_user_ids WHERE customer_id IS NOT NULL GROUP BY customer_id
You're right, last minute changes... I've updated the snippet. Thanks!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I think the last few lines should read:
FROM
ga_user_ids
WHERE customer_id IS NOT NULL
GROUP BY
customer_id