Created
July 28, 2022 05:53
-
-
Save khunreus/eeeaa966c7983abfe62760de77cf7488 to your computer and use it in GitHub Desktop.
The query pulls page_view event data from the Google BigQuery public dataset for ecommerce (GA4). The implementation goes around using traditional UNNEST approach as it can cause missing rows when a CROSS JOIN is applied to a table with NULL values.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- the query pulls page_view event data from the Google BigQuery public dataset for ecommerce (GA4) | |
-- the implementation with array aggregation is addressing an issue raised here https://stackoverflow.com/questions/44918108/google-bigquery-i-lost-null-row-when-using-unnest-function | |
-- stackoverflow thread is exploring a different solution | |
WITH base_table AS ( | |
-- pulls relevant columns from relevant dates to decrease the size of data scanned | |
SELECT | |
event_name, | |
event_date, | |
event_timestamp, | |
user_pseudo_id, | |
user_id, | |
device, | |
geo, | |
traffic_source, | |
event_params, | |
user_properties | |
FROM | |
`bigquery-public-data.ga4_obfuscated_sample_ecommerce.events_*` | |
WHERE | |
_table_suffix >= '20210101' | |
AND event_name IN ( | |
'page_view' | |
) | |
) | |
, unnested_events AS ( | |
-- unnests event parameters to get to relevant keys and values | |
SELECT | |
event_date AS date, | |
event_timestamp AS event_timestamp_microseconds, | |
user_pseudo_id, | |
ARRAY_AGG(DISTINCT ARRAY( | |
SELECT DISTINCT | |
c.value.int_value AS ga_session_id | |
FROM | |
UNNEST(event_params) c | |
WHERE c.key = 'ga_session_id' | |
AND c.value.int_value is NOT NULL | |
GROUP BY 1)[SAFE_OFFSET(0)] | |
IGNORE NULLS)[SAFE_OFFSET(0)] visitID, | |
ARRAY_AGG(DISTINCT ARRAY( | |
SELECT DISTINCT | |
c.value.int_value AS ga_session_number | |
FROM | |
UNNEST(event_params) c | |
WHERE c.key = 'ga_session_number' | |
AND c.value.int_value is NOT NULL | |
GROUP BY 1)[SAFE_OFFSET(0)] | |
IGNORE NULLS)[SAFE_OFFSET(0)] visitNumber | |
,ARRAY_AGG(DISTINCT ARRAY( | |
SELECT DISTINCT | |
c.value.string_value AS page_title | |
FROM | |
UNNEST(event_params) c | |
WHERE c.key = 'page_title' | |
GROUP BY 1)[SAFE_OFFSET(0)] | |
IGNORE NULLS)[SAFE_OFFSET(0)] page_title | |
FROM | |
base_table | |
GROUP BY 1,2,3 | |
) | |
, ranked_screens AS ( | |
-- prepares additional data points for analytics to understand transitions between the previous, current and following pages | |
SELECT | |
*, | |
DENSE_RANK() OVER (PARTITION BY user_pseudo_id, visitID ORDER BY event_timestamp_microseconds ASC) page_rank, | |
LAG(page_title,1) OVER (PARTITION BY user_pseudo_id, visitID ORDER BY event_timestamp_microseconds ASC) previous_page, | |
LEAD(page_title,1) OVER (PARTITION BY user_pseudo_id, visitID ORDER BY event_timestamp_microseconds ASC) next_page | |
FROM | |
unnested_events | |
--WHERE | |
-- length(trim(page_title)) > 0 | |
) | |
, screen_summary AS ( | |
-- another layer of analytics: check the last page number viewed on a session | |
-- aggregate all screens per session, which will be helpful in identifying power users | |
SELECT | |
*, | |
MAX(page_rank) OVER (PARTITION BY user_pseudo_id, visitID) last_page_rank, | |
ARRAY_AGG(page_title) OVER (PARTITION BY user_pseudo_id, visitID) pages_on_a_visit | |
FROM | |
ranked_screens | |
) | |
, screen_summary_agg AS ( | |
-- cleans up pages_on_a_visit field | |
SELECT * EXCEPT( | |
pages_on_a_visit | |
), | |
ARRAY_TO_STRING( | |
ARRAY( | |
SELECT DISTINCT | |
* | |
FROM | |
UNNEST(pages_on_a_visit) | |
ORDER BY 1 ASC | |
) | |
, '||') AS screens_on_a_visit | |
FROM | |
screen_summary | |
) | |
SELECT | |
-- final analytical point: the first time a page is viewed in a session | |
*, | |
MIN(page_rank) OVER (PARTITION BY user_pseudo_id, visitID, page_title) AS page_rank_session_first_seen | |
FROM | |
screen_summary_agg |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment