No metrics in
Self Joins
- Cross Join example
- Inner Join example
joins can be inefficent - they can require moving lots of data around and increase the data process from n to n * n
Windowing Functions
-- Count the Number of Sessions Generated by Organic Searches | |
SELECT COUNT(totals.visits) AS visitCount | |
FROM [google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910] | |
WHERE trafficSource.medium = "organic" |
SELECT COUNT(totals.bounces)/COUNT(totals.visits) AS bounceRate | |
FROM [google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910] | |
-- from: http://www.lunametrics.com/blog/2016/02/11/querying-google-analytics-data-in-bigquery/ |
SELECT HOUR(SEC_TO_TIMESTAMP(visitStartTime)) AS sessionHour, | |
COUNT(totals.bounces)/COUNT(totals.visits) AS bounceRate | |
FROM [google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910] | |
GROUP BY sessionHour | |
-- from: http://www.lunametrics.com/blog/2016/02/11/querying-google-analytics-data-in-bigquery/ |
-- Per-hour bounce rate calculation | |
SELECT HOUR(SEC_TO_TIMESTAMP(visitStartTime)) AS sessionHour, -- Extracts the hour from the unix timestamp | |
-- COUNT is an aggregate function and will only count non-NULL values | |
COUNT(totals.bounces)/COUNT(totals.visits) AS bounceRate -- Computes the bounce rate. Don't use AVERAGE! it ignores NULLs | |
-- Let's just use the one example table | |
FROM [google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910] | |
GROUP BY sessionHour | |
-- from: http://www.lunametrics.com/blog/2016/02/11/querying-google-analytics-data-in-bigquery/ |
SELECT trafficSource.medium AS medium, | |
COUNT(totals.bounces)/COUNT(totals.visits) AS bounceRate | |
FROM [google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910] | |
GROUP BY medium | |
-- from: http://www.lunametrics.com/blog/2016/02/11/querying-google-analytics-data-in-bigquery/ |
-- Mediums with the Most Sessions per Hour | |
SELECT trafficSource.medium AS medium, | |
HOUR(SEC_TO_TIMESTAMP(visitStartTime)) AS sessionHour, | |
COUNT(totals.visits) AS sessionCount, | |
COUNT(totals.bounces)/COUNT(totals.visits) AS bounceRate | |
FROM [google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910] | |
GROUP BY medium, sessionHour | |
ORDER BY sessionHour, sessionCount DESC | |
-- from: http://www.lunametrics.com/blog/2016/02/11/querying-google-analytics-data-in-bigquery/ |
SELECT date, | |
SUM(totals.visits) AS sessions | |
FROM [google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910] | |
GROUP BY date | |
--http://www.lunametrics.com/blog/2016/06/23/google-analytics-bigquery-export-schema/ |
SELECT date, | |
SUM(totals.visits) AS sessions | |
FROM [google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910] | |
GROUP BY date | |
HAVING sessions > 70 | |
-- http://www.lunametrics.com/blog/2016/06/23/google-analytics-bigquery-export-schema/ |
SELECT date, | |
SUM(totals.visits) AS sessions | |
FROM [google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910] | |
WHERE totals.transactions > 0 | |
GROUP BY date | |
-- http://www.lunametrics.com/blog/2016/06/23/google-analytics-bigquery-export-schema/ |
SELECT trafficSource.medium, | |
SUM(totals.newVisits) / SUM(totals.visits) AS percentNewSessions | |
FROM [google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910] | |
GROUP BY trafficSource.medium |
SELECT | |
CONCAT(trafficSource.source,"/",trafficSource.medium) AS sourceMedium, | |
newSessions / sessions AS percentNewSessions, | |
bounces / sessions AS bounceRate | |
FROM ( | |
SELECT | |
trafficSource.source, | |
trafficSource.medium, | |
SUM(totals.visits) AS sessions, | |
SUM(totals.newVisits) AS newSessions, | |
SUM(totals.bounces) AS bounces, | |
FROM | |
[google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910] | |
GROUP BY | |
trafficSource.source, | |
trafficSource.medium) |
SELECT | |
trafficSource.medium, | |
SUM(totals.visits) AS sessions, | |
-- 5 will give the min, 25%, 50%, 75%, max with 20% error | |
-- the more buckets, the better the approximation (error = 1/number of buckets) | |
-- at the cost of more computation | |
-- QUANTILES returns all of the buckets, use NTH to extract the bucket you want | |
NTH(2, QUANTILES(totals.timeOnSite, 5)) AS firstQuartile, | |
NTH(3, QUANTILES(totals.timeOnSite, 5)) AS mean, | |
NTH(3, QUANTILES(totals.timeOnSite, 5)) AS thirdQuartile | |
FROM ( | |
SELECT | |
trafficSource.medium, | |
totals.visits, | |
-- Sessions with a single page view will have no time on site reported | |
IF(totals.timeOnSite IS NULL, 0, totals.timeOnSite) AS totals.timeOnSite | |
FROM | |
[google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910]) | |
GROUP BY | |
trafficSource.medium |
-- of people who viewed one specific page (pageA) | |
-- which other pages (pageB) did they view and how many times (crossCount) ? | |
SELECT | |
sp as pageA, | |
pp as pageB, | |
COUNT(*) AS crossCount | |
FROM ( | |
SELECT | |
sess.fullvisitorid AS sfvi, | |
pages.fullVisitorid AS pfvi, | |
sess.hits.page.pagePath AS sp, | |
pages.hits.page.pagePath AS pp | |
FROM | |
FLATTEN([google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910], hits) AS sess | |
CROSS JOIN | |
FLATTEN([google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910], hits) AS pages | |
WHERE | |
-- ensures each combo is only counted a single time | |
pages.fullvisitorid < sess.fullvisitorid | |
AND pages.hits.page.pagePath < sess.hits.page.pagePath | |
GROUP BY sfvi, pfvi, sp, pp ) | |
GROUP BY pageA, pageB | |
ORDER BY pageA, crossCount DESC | |
-- from - http://www.lunametrics.com/blog/2016/05/12/self-joins-windowing-user-defined-functions-bigquery/ |
- find the the page viewed before a page, | |
- we could find all related pages in a session using a self-join, | |
- filter out, using a WHERE clause because in BigQuery join conditions, in the ON, cannot have inequalities, | |
SELECT | |
cur_hit.fullVisitorId AS fullVisitorId, | |
cur_hit.visitId AS visitID, | |
cur_hit.hits.hitNumber AS cur_hitnumber, | |
cur_hit.hits.page.pagePath as cur_pagePath, | |
cur_hit.hits.time AS cur_time, | |
MAX(prev_hit.hits.hitNumber) AS prev_hitNumber, | |
FROM | |
FLATTEN([google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910], hits) AS prev_hit | |
INNER JOIN | |
FLATTEN([google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910], hits) AS cur_hit | |
ON | |
prev_hit.fullvisitorid = cur_hit.fullvisitorid | |
AND prev_hit.visitid = cur_hit.visitid | |
WHERE | |
prev_hit.hits.hitNumber < cur_hit.hits.hitNumber | |
AND prev_hit.hits.type = "PAGE" | |
AND cur_hit.hits.type = "PAGE" | |
GROUP BY fullVisitorId, visitID, cur_hitnumber, cur_pagePath, cur_time | |
-- http://www.lunametrics.com/blog/2016/05/12/self-joins-windowing-user-defined-functions-bigquery/ |
-- using windowing to find time on page | |
-- COUNT(num) OVER (PARTITION BY num ) as Var I | |
-- different windowing functions can be used in the same query | |
SELECT | |
fullVisitorId, | |
visitId, | |
hits.hitNumber, | |
hits.page.pagePath, | |
-- get next and previous hit time to be able to work out length of each hit | |
hits.time AS hit_time, | |
LEAD(hits.time, 1) OVER (PARTITION BY fullVisitorId, visitId ORDER BY hits.time ASC) AS next_hit_time, | |
FROM | |
[google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910] | |
WHERE | |
hits.type = "PAGE" |
SELECT | |
fullVisitorId, | |
visitId, | |
hitNumber, | |
pagePath, | |
diff AS timeOnPage_ms | |
FROM | |
timePerPage( | |
SELECT | |
fullVisitorId, | |
visitId, | |
NEST(hits.page.pagePath) AS pages, | |
NEST(hits.type) AS types, | |
NEST(hits.time) AS times | |
FROM | |
[google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910] | |
GROUP BY | |
fullVisitorId, | |
visitId | |
LIMIT | |
10) |
SELECT | |
COUNT(*) as logins_after_product_view | |
FROM ( | |
SELECT | |
SUM(IF(hits.eventinfo.eventCategory = "Login", 1, 0)) OVER (PARTITION BY fullvisitorid, visitid ROWS BETWEEN 1 FOLLOWING AND 3 FOLLOWING) AS login_count, | |
hits.page.pagePath as pagePath | |
FROM | |
[google.com:analytics-bigquery:LondonCycleHelmet.ga_sessions_20130910] ) | |
WHERE | |
login_count > 0 | |
AND (pagePath CONTAINS 'vests' OR pagePath CONTAINS 'helments') | |
-- from: http://www.lunametrics.com/blog/2016/02/11/querying-google-analytics-data-in-bigquery/ |