Created
October 9, 2024 16:43
-
-
Save nicobrx/b971c90a33dc135d167f19729a3947b6 to your computer and use it in GitHub Desktop.
Looker Studio custom query to extract path data from GA4 export data in BigQuery
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- This is intended to be run as a Looker Studio BigQuery custom query. To run it directly in BigQuery, change @DS_START_DATE | |
-- and @DS_END_DATE to 'YYYYMMDD' string values. To use it in BigQuery, you will also need to enable date range parameters | |
-- in the custom query. Be wary of using this query for high-volume GA4 properties or long date ranges, as it could incur | |
-- BigQuery analysis costs. This github repo has a Google Dataform repository that is designed to update a table with path data | |
-- on a daily basis, which is a much more efficient and cost-effective approach: https://github.com/nicobrx/ga4-path-analysis | |
-- This query references an 'extract_path' UDF, the original version of which can be found here: | |
-- https://github.com/GoogleCloudPlatform/bigquery-utils/blob/master/udfs/community/cw_url_extract_path.sqlx | |
-- You will need to update the reference in this query to the location of the UDF you create in your BigQuery project | |
-- You will also need to update the FROM references in the source_mediums_prep and ranked_path subqueries below to match the location | |
-- of your GA4 export table | |
WITH | |
dates AS ( -- change the start and end date for all queries here | |
SELECT | |
@DS_START_DATE AS start_date, | |
@DS_END_DATE AS end_date, | |
), | |
-- get source / medium values by session_id | |
source_mediums_prep AS ( | |
SELECT | |
CONCAT(user_pseudo_id,(SELECT value.int_value FROM UNNEST(event_params) WHERE key = 'ga_session_id')) AS session_id, | |
COALESCE(ARRAY_AGG((SELECT value.string_value FROM UNNEST(event_params) WHERE key = 'source') IGNORE NULLS | |
ORDER BY event_timestamp)[SAFE_OFFSET(0)],'(direct)') AS session_source, | |
COALESCE(ARRAY_AGG((SELECT value.string_value FROM UNNEST(event_params) WHERE key = 'medium') IGNORE NULLS | |
ORDER BY event_timestamp)[SAFE_OFFSET(0)],'(none)') AS session_medium, | |
ARRAY_AGG((SELECT value.string_value FROM UNNEST(event_params) WHERE key = 'page_location') IGNORE NULLS | |
ORDER BY event_timestamp)[SAFE_OFFSET(0)] LIKE '%gclid%' AS has_gclid | |
FROM | |
`your_bigquery.analytics_12345678.events_*` | |
WHERE _table_suffix BETWEEN (SELECT start_date FROM dates) AND (SELECT end_date FROM dates) | |
GROUP BY session_id ), | |
-- if a gclid is present, assign it to google / cpc | |
source_mediums AS ( | |
SELECT | |
session_id, | |
CASE | |
WHEN has_gclid IS TRUE THEN 'google / cpc' | |
ELSE CONCAT(IFNULL(session_source,'(direct)'),' / ',IFNULL(session_medium,'(none)')) | |
END AS session_source_medium, | |
FROM source_mediums_prep ), | |
-- get session path sequences and assign each step a sequence number (seq_num) | |
ranked_paths AS ( | |
SELECT | |
PARSE_DATE('%Y%m%d', event_date) AS date, | |
CONCAT(user_pseudo_id,(SELECT value.int_value FROM UNNEST(event_params)WHERE key = 'ga_session_id')) AS session_id, | |
event_timestamp, | |
(CASE | |
WHEN event_name = 'page_view' THEN `twooctobersbigquery.udf_library.extract_path`(( SELECT value.string_value FROM UNNEST(event_params) WHERE KEY = 'page_location')) | |
ELSE REPLACE(event_name,"_"," ") | |
END) AS page_path, | |
ROW_NUMBER() OVER(PARTITION BY (CONCAT(user_pseudo_id,(SELECT value.int_value FROM UNNEST(event_params) | |
WHERE key = 'ga_session_id'))) | |
ORDER BY | |
event_timestamp) AS seq_num | |
FROM | |
`your_bigquery.analytics_12345678.events_*` events | |
WHERE | |
_table_suffix BETWEEN (SELECT start_date FROM dates) AND (SELECT end_date FROM dates) | |
-- update the following list if you want to see specific events included in path behavior | |
-- contact_us_submission is included here as an example - the event names have to match | |
-- event names in your GA4 property | |
AND event_name IN ('page_view','contact_us_submission') | |
ORDER BY session_id, event_timestamp ), | |
-- join sequenced event data and source / medium values | |
paths_w_sources AS (SELECT | |
source_mediums.session_source_medium, | |
ranked_paths.* | |
FROM ranked_paths | |
JOIN source_mediums | |
ON ranked_paths.session_id = source_mediums.session_id), | |
-- add the previous page - this will be used for deduping in the next step | |
filtered_paths AS ( | |
SELECT | |
date, | |
session_id, | |
event_timestamp, | |
session_source_medium, | |
page_path, | |
seq_num, | |
LAG(page_path) OVER(PARTITION BY session_id ORDER BY seq_num) as prev_page_path | |
FROM | |
paths_w_sources | |
), | |
-- remove instances where a user visits the same page more than once sequentially | |
deduped_paths AS ( | |
SELECT | |
date, | |
session_id, | |
event_timestamp, | |
session_source_medium, | |
page_path | |
FROM | |
filtered_paths | |
WHERE | |
page_path != prev_page_path OR prev_page_path IS NULL | |
), | |
-- convert sequential page views per session into an array - results in one row per session | |
paths_array AS ( | |
SELECT | |
MIN(date) as date, | |
session_id, | |
session_source_medium, | |
ARRAY_AGG(page_path ORDER BY event_timestamp ASC) AS path_steps | |
FROM | |
deduped_paths | |
GROUP BY | |
session_id, session_source_medium | |
) | |
-- unpack sequential views into individual columns | |
SELECT | |
date, | |
session_id, | |
session_source_medium, | |
path_steps[OFFSET(0)] AS step_1, | |
path_steps[SAFE_OFFSET(1)] AS step_2, | |
path_steps[SAFE_OFFSET(2)] AS step_3, | |
path_steps[SAFE_OFFSET(3)] AS step_4, | |
path_steps[SAFE_OFFSET(4)] AS step_5, | |
path_steps[SAFE_OFFSET(5)] AS step_6, | |
path_steps[SAFE_OFFSET(6)] AS step_7, | |
path_steps[SAFE_OFFSET(7)] AS step_8, | |
path_steps[SAFE_OFFSET(8)] AS step_9, | |
path_steps[SAFE_OFFSET(9)] AS step_10, | |
-- this is to make it possible to search for paths with a given step in Looker Studio | |
ARRAY_TO_STRING(path_steps," ") AS all_steps, | |
ARRAY_LENGTH(path_steps) AS number_of_steps | |
FROM paths_array |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment