This gist contains code samples for the blog post "Connected data: Using BigQuery to analyse user behaviour in response to webhooks".
Last active
December 31, 2020 14:29
-
-
Save lawrencejones/5850c75ecdcbb77492c9e37d11076643 to your computer and use it in GitHub Desktop.
Connected data: Using BigQuery to analyse user behaviour in response to webhooks
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
extractIDs takes the webhook request payload, which is structured like so: | |
{ | |
"events": [ | |
{ "links": { "<resource-name>": "ID", ... } }, | |
..., | |
] | |
} | |
And extracts the unique set of resource IDs referenced by the webhook. | |
*/ | |
CREATE TEMP FUNCTION | |
extractIDs(json STRING) | |
RETURNS ARRAY<string> | |
LANGUAGE js AS """ | |
return [ | |
... new Set( | |
JSON.parse(json)["events"] | |
.map(e=>Object.values(e["links"])) | |
.flat() | |
) | |
]; | |
"""; | |
WITH | |
-- Expand each webhook into a row per resource ID from the request_body | |
-- payload. Use the Javascript extractIDs function to parse all the events, | |
-- then UNNEST the parsed array into separate rows. | |
webhook_resources AS ( | |
SELECT | |
id, | |
created_at, | |
resource_id | |
FROM ( | |
SELECT | |
id, | |
created_at, | |
extractIDs(request_body) AS resource_ids | |
FROM | |
webhooks ) AS webhook_resources | |
-- UNNEST transforms the array of resources IDs into a row per resource ID, | |
-- while the CROSS JOIN finds every pairing from the original webhook (1) to | |
-- every resource ID (n). | |
CROSS JOIN | |
UNNEST(resource_ids) AS resource_id | |
), | |
-- Join each HTTP request log onto webhook_resources. Set the | |
-- webhook_work_duration field to be the duration of the request, if we can | |
-- find a webhook that was sent within 1m either side of the request. | |
requests AS ( | |
SELECT | |
merchant_activity._log_timestamp AS timestamp, | |
merchant_activity.handler, | |
merchant_activity.resource_id, | |
merchant_activity.duration, | |
( | |
CASE webhook_resources.resource_id IS NULL | |
WHEN TRUE THEN 0.0 | |
ELSE merchant_activity.duration | |
END | |
) AS webhook_work_duration | |
FROM | |
merchant_activity | |
-- OUTER JOIN to ensure we produce a row for every merchant_activity entry, as | |
-- we'll want to understand the ratio of webhook work to other API activity, | |
-- which is best measured in duration spent serving each request. | |
LEFT OUTER JOIN | |
webhook_resources | |
-- Allow for matching against a request either side by 1m of our API request. | |
-- This permits for slippage, as the log has a timestamp set at the completion | |
-- of a request and some integrators may respond to webhooks inline, rather | |
-- than asynchronously. | |
ON | |
webhook_resources.resource_id=merchant_activity.resource_id | |
AND ABS(UNIX_SECONDS(_log_timestamp) - UNIX_SECONDS(created_at)) < 60 | |
AND merchant_activity.resource_id IS NOT NULL | |
AND merchant_activity._log_timestamp > '2020-10-29' | |
) | |
-- Present work ratio rolled-up to each minute of our results. Provide total | |
-- durations so we can perform additional roll-ups later. | |
SELECT | |
TIMESTAMP_TRUNC(timestamp, MINUTE) AS timestamp, | |
COUNT(*) AS count, | |
SUM(duration) AS duration, | |
SUM(webhook_work_duration) AS webhook_work_duration, | |
SUM(webhook_work_duration) / SUM(duration) AS work_ratio | |
FROM | |
requests | |
GROUP BY | |
1 | |
ORDER BY | |
1; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment