Skip to content

Instantly share code, notes, and snippets.

@nicobrx
Created October 9, 2024 16:43
Show Gist options
  • Save nicobrx/b971c90a33dc135d167f19729a3947b6 to your computer and use it in GitHub Desktop.
Save nicobrx/b971c90a33dc135d167f19729a3947b6 to your computer and use it in GitHub Desktop.
Looker Studio custom query to extract path data from GA4 export data in BigQuery
-- This is intended to be run as a Looker Studio BigQuery custom query. To run it directly in BigQuery, change @DS_START_DATE
-- and @DS_END_DATE to 'YYYYMMDD' string values. To use it in BigQuery, you will also need to enable date range parameters
-- in the custom query. Be wary of using this query for high-volume GA4 properties or long date ranges, as it could incur
-- BigQuery analysis costs. This github repo has a Google Dataform repository that is designed to update a table with path data
-- on a daily basis, which is a much more efficient and cost-effective approach: https://github.com/nicobrx/ga4-path-analysis
-- This query references an 'extract_path' UDF, the original version of which can be found here:
-- https://github.com/GoogleCloudPlatform/bigquery-utils/blob/master/udfs/community/cw_url_extract_path.sqlx
-- You will need to update the reference in this query to the location of the UDF you create in your BigQuery project
-- You will also need to update the FROM references in the source_mediums_prep and ranked_path subqueries below to match the location
-- of your GA4 export table
WITH
dates AS ( -- change the start and end date for all queries here
SELECT
@DS_START_DATE AS start_date,
@DS_END_DATE AS end_date,
),
-- get source / medium values by session_id
source_mediums_prep AS (
SELECT
CONCAT(user_pseudo_id,(SELECT value.int_value FROM UNNEST(event_params) WHERE key = 'ga_session_id')) AS session_id,
COALESCE(ARRAY_AGG((SELECT value.string_value FROM UNNEST(event_params) WHERE key = 'source') IGNORE NULLS
ORDER BY event_timestamp)[SAFE_OFFSET(0)],'(direct)') AS session_source,
COALESCE(ARRAY_AGG((SELECT value.string_value FROM UNNEST(event_params) WHERE key = 'medium') IGNORE NULLS
ORDER BY event_timestamp)[SAFE_OFFSET(0)],'(none)') AS session_medium,
ARRAY_AGG((SELECT value.string_value FROM UNNEST(event_params) WHERE key = 'page_location') IGNORE NULLS
ORDER BY event_timestamp)[SAFE_OFFSET(0)] LIKE '%gclid%' AS has_gclid
FROM
`your_bigquery.analytics_12345678.events_*`
WHERE _table_suffix BETWEEN (SELECT start_date FROM dates) AND (SELECT end_date FROM dates)
GROUP BY session_id ),
-- if a gclid is present, assign it to google / cpc
source_mediums AS (
SELECT
session_id,
CASE
WHEN has_gclid IS TRUE THEN 'google / cpc'
ELSE CONCAT(IFNULL(session_source,'(direct)'),' / ',IFNULL(session_medium,'(none)'))
END AS session_source_medium,
FROM source_mediums_prep ),
-- get session path sequences and assign each step a sequence number (seq_num)
ranked_paths AS (
SELECT
PARSE_DATE('%Y%m%d', event_date) AS date,
CONCAT(user_pseudo_id,(SELECT value.int_value FROM UNNEST(event_params)WHERE key = 'ga_session_id')) AS session_id,
event_timestamp,
(CASE
WHEN event_name = 'page_view' THEN `twooctobersbigquery.udf_library.extract_path`(( SELECT value.string_value FROM UNNEST(event_params) WHERE KEY = 'page_location'))
ELSE REPLACE(event_name,"_"," ")
END) AS page_path,
ROW_NUMBER() OVER(PARTITION BY (CONCAT(user_pseudo_id,(SELECT value.int_value FROM UNNEST(event_params)
WHERE key = 'ga_session_id')))
ORDER BY
event_timestamp) AS seq_num
FROM
`your_bigquery.analytics_12345678.events_*` events
WHERE
_table_suffix BETWEEN (SELECT start_date FROM dates) AND (SELECT end_date FROM dates)
-- update the following list if you want to see specific events included in path behavior
-- contact_us_submission is included here as an example - the event names have to match
-- event names in your GA4 property
AND event_name IN ('page_view','contact_us_submission')
ORDER BY session_id, event_timestamp ),
-- join sequenced event data and source / medium values
paths_w_sources AS (SELECT
source_mediums.session_source_medium,
ranked_paths.*
FROM ranked_paths
JOIN source_mediums
ON ranked_paths.session_id = source_mediums.session_id),
-- add the previous page - this will be used for deduping in the next step
filtered_paths AS (
SELECT
date,
session_id,
event_timestamp,
session_source_medium,
page_path,
seq_num,
LAG(page_path) OVER(PARTITION BY session_id ORDER BY seq_num) as prev_page_path
FROM
paths_w_sources
),
-- remove instances where a user visits the same page more than once sequentially
deduped_paths AS (
SELECT
date,
session_id,
event_timestamp,
session_source_medium,
page_path
FROM
filtered_paths
WHERE
page_path != prev_page_path OR prev_page_path IS NULL
),
-- convert sequential page views per session into an array - results in one row per session
paths_array AS (
SELECT
MIN(date) as date,
session_id,
session_source_medium,
ARRAY_AGG(page_path ORDER BY event_timestamp ASC) AS path_steps
FROM
deduped_paths
GROUP BY
session_id, session_source_medium
)
-- unpack sequential views into individual columns
SELECT
date,
session_id,
session_source_medium,
path_steps[OFFSET(0)] AS step_1,
path_steps[SAFE_OFFSET(1)] AS step_2,
path_steps[SAFE_OFFSET(2)] AS step_3,
path_steps[SAFE_OFFSET(3)] AS step_4,
path_steps[SAFE_OFFSET(4)] AS step_5,
path_steps[SAFE_OFFSET(5)] AS step_6,
path_steps[SAFE_OFFSET(6)] AS step_7,
path_steps[SAFE_OFFSET(7)] AS step_8,
path_steps[SAFE_OFFSET(8)] AS step_9,
path_steps[SAFE_OFFSET(9)] AS step_10,
-- this is to make it possible to search for paths with a given step in Looker Studio
ARRAY_TO_STRING(path_steps," ") AS all_steps,
ARRAY_LENGTH(path_steps) AS number_of_steps
FROM paths_array
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment