Created
June 27, 2023 12:46
-
-
Save vincentkoc/14bde9a0efef73c2d6d3b72d70a28ba3 to your computer and use it in GitHub Desktop.
Custom GA4 Attribution Model
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
GA4 Attribution Model | |
This query is designed to replicate the old Universal Analytics attribution | |
model in Google Analytics 4 (GA4). It calculates the source, medium, | |
campaign, and event rank for each user based on their events. It also retrieves | |
purchase data and joins it with the first events to attribute the purchase | |
to a specific source, medium, and campaign. | |
Assumptions: | |
- The query is run within a GA4 project. | |
- The event_timestamp field represents the timestamp of each event. | |
- The event_params field contains a list of parameters associated with each event. | |
- The collected_traffic_source fields represent manually collected traffic source information. | |
- The traffic_source fields represent traffic source information captured by GA4. | |
- The first_events CTE assigns a rank to events based on priority and event_bundle_sequence_id. | |
- The purchases CTE retrieves purchase events and their associated transaction IDs. | |
- The deduped_first_events CTE selects the first event for each user and removes duplicates. | |
Things to watch out for: | |
- Verify that the event_timestamp, event_params, collected_traffic_source, and traffic_source fields are correctly defined in the GA4 schema. | |
- Check if the priorities assigned in the CASE statement in the events CTE accurately represent your attribution model. | |
- Adjust the WHERE clause in the events CTE to filter events based on your desired time range. | |
- Confirm that the event_name 'purchase' and event_param 'transaction_id' match your GA4 event schema. | |
- Modify the transaction_id values in the last WHERE clause to match the desired transactions you want to analyze. | |
- Uncomment the SELECT statement at the end to include all columns in the output. | |
Further Reading: | |
- Google official schema for GA4 in BigQuery: https://support.google.com/analytics/answer/7029846?hl=en | |
- GA vs GA4 Attribution using BQ: https://tanelytics.com/ga4-bigquery-session-traffic_source/ | |
- Capturing traffic data in events for BQ: https://gtm-gear.com/posts/ga4-sessions-source-medium/ | |
*/ | |
WITH | |
-- CTE to NULLIFY direct traffic so we can properly sort/rank channel data | |
nullified_direct AS ( | |
SELECT | |
user_pseudo_id, | |
event_date, | |
event_timestamp, | |
event_bundle_sequence_id, | |
collected_traffic_source.manual_source, | |
collected_traffic_source.manual_medium, | |
collected_traffic_source.manual_campaign_name, | |
CASE WHEN traffic_source.source IN ('(direct)', NULL) THEN NULL ELSE traffic_source.source END as source, | |
CASE WHEN traffic_source.source IN ('(direct)', NULL) THEN NULL ELSE traffic_source.medium END as medium, | |
CASE WHEN traffic_source.source IN ('(direct)', NULL) THEN NULL ELSE traffic_source.name END as name, | |
device, | |
event_params | |
FROM | |
`<your-project-id>.analytics_<your-table-id>.events_*`, | |
WHERE _TABLE_SUFFIX >= FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)) | |
), | |
-- CTE to select relevant event data and determine source, medium, and campaign | |
processed_events AS ( | |
SELECT | |
user_pseudo_id, | |
event_date, | |
event_timestamp, | |
event_bundle_sequence_id, | |
COALESCE(source, manual_source) as source, | |
COALESCE(medium, manual_medium) as medium, | |
COALESCE(manual_campaign_name, name) AS campaign, | |
device.category as device_category, | |
COALESCE(device.web_info.browser,device.browser) as browser, | |
COALESCE(device.web_info.browser_version,device.browser_version) as browser_version, | |
(SELECT value.string_value FROM UNNEST(event_params) WHERE key = 'page_location' LIMIT 1) AS landing_page, | |
(SELECT value.string_value FROM UNNEST(event_params) WHERE key = 'ad_group' LIMIT 1) as ad_group, | |
(SELECT value.string_value FROM UNNEST(event_params) WHERE key = 'keyword' LIMIT 1) as keyword, | |
(SELECT IFNULL(value.string_value, NULL) FROM UNNEST(event_params) WHERE key = 'gclid' LIMIT 1) AS gclid, | |
EXISTS(SELECT 1 FROM UNNEST(event_params) WHERE key IN ('gclid', 'gbraid', 'wbraid')) as is_adwords | |
FROM | |
nullified_direct | |
), | |
-- Here we separate out the logic for determining the source and medium based on the processed events. | |
events AS ( | |
SELECT | |
*, | |
CASE | |
WHEN is_adwords THEN 'google' | |
WHEN source IS NULL OR source = '0' THEN '(direct)' | |
ELSE source | |
END AS final_source, | |
CASE | |
WHEN is_adwords THEN 'cpc' | |
WHEN source IS NULL OR source = '0' THEN '(none)' | |
ELSE medium | |
END AS final_medium, | |
CASE | |
WHEN medium IN ('cpc', 'paidsearch') THEN 1 | |
WHEN medium IN ('organic') THEN 2 | |
WHEN medium IN ('social') THEN 3 | |
WHEN medium IN ('edm') THEN 4 | |
WHEN medium IN ('email') THEN 5 | |
WHEN medium IN ('display') THEN 6 | |
WHEN medium IN ('affiliates') THEN 7 | |
WHEN medium NOT IN ('transactional-sms') THEN 8 | |
WHEN medium NOT IN ('push') THEN 9 | |
ELSE 10 | |
END AS priority | |
FROM | |
processed_events | |
), | |
-- CTE to retrieve purchase data and transaction ID | |
purchases AS ( | |
SELECT | |
event_date, | |
event_timestamp, | |
user_pseudo_id, | |
MAX(IF(event_param.key = 'transaction_id', event_param.value.string_value, NULL)) AS transaction_id | |
FROM | |
`<your-project-id>.analytics_<your-table-id>.events_*`, | |
UNNEST(event_params) AS event_param | |
WHERE event_name = 'purchase' | |
AND _TABLE_SUFFIX >= FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)) | |
GROUP BY event_date, event_timestamp, user_pseudo_id | |
), | |
-- CTE to retrieve the first event for each user_pseudo_id based on the minimum event timestamp for landing page data | |
first_landing_page_events AS ( | |
SELECT | |
-- join and rank keys | |
user_pseudo_id, | |
event_timestamp, | |
event_bundle_sequence_id, | |
event_date, | |
-- specific data | |
landing_page, | |
device_category, | |
browser, | |
browser_version, | |
ROW_NUMBER() OVER (PARTITION BY user_pseudo_id ORDER BY event_timestamp, event_bundle_sequence_id) AS landing_page_rank | |
FROM events | |
), | |
-- CTE to retrieve the first non-null channel event for each user_pseudo_id | |
first_channel_events AS ( | |
SELECT | |
-- join and rank keys | |
user_pseudo_id, | |
event_timestamp, | |
event_bundle_sequence_id, | |
event_date, | |
-- specific data | |
gclid, | |
COALESCE(source, '(direct)') AS source, | |
COALESCE(medium, '(none)') AS medium, | |
campaign, | |
ad_group, | |
keyword, | |
ROW_NUMBER() OVER (PARTITION BY user_pseudo_id ORDER BY event_timestamp, event_bundle_sequence_id, priority) AS channel_rank | |
FROM events | |
WHERE source IS NOT NULL | |
), | |
-- CTE to retrieve deduplicated first event data | |
-- we have cases were we have competing channel data for one purchase | |
-- give 100% credit based on priorty from event_rank | |
-- CTE to retrieve deduplicated first event data | |
deduped_first_events AS ( | |
SELECT | |
-- Grab user level data from initial session start | |
fpe.user_pseudo_id, | |
fpe.event_timestamp, | |
fpe.event_bundle_sequence_id, | |
fpe.device_category, | |
fpe.landing_page, | |
fpe.browser, | |
fpe.browser_version, | |
-- Grab channel based on channel ranking | |
fce.gclid, | |
COALESCE(source, '(direct)') AS source, | |
COALESCE(medium, '(none)') AS medium, | |
fce.campaign, | |
fce.ad_group, | |
fce.keyword, | |
FROM first_landing_page_events fpe | |
LEFT JOIN first_channel_events fce | |
ON fpe.user_pseudo_id = fce.user_pseudo_id AND fce.channel_rank = 1 | |
WHERE fpe.landing_page_rank = 1 | |
) | |
-- SELECT statement to retrieve attribution data | |
SELECT | |
p.event_date, | |
transaction_id, | |
CASE | |
WHEN dfe.source IS NULL AND dfe.medium = 'referral' THEN '(direct)' | |
ELSE dfe.source | |
END AS source, | |
CASE | |
WHEN dfe.source IS NULL AND dfe.medium = 'referral' THEN '(none)' | |
ELSE dfe.medium | |
END AS medium, | |
dfe.campaign, | |
p.user_pseudo_id, | |
dfe.gclid, | |
dfe.device_category, | |
dfe.browser, | |
dfe.browser_version, | |
dfe.ad_group, | |
dfe.keyword, | |
dfe.landing_page | |
FROM purchases p | |
LEFT JOIN deduped_first_events dfe | |
ON p.user_pseudo_id = dfe.user_pseudo_id | |
ORDER BY p.event_date, p.transaction_id |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment