Skip to content

Instantly share code, notes, and snippets.

@pliniosilveira
Created May 30, 2023 17:54
Show Gist options
  • Save pliniosilveira/960cd8e3c2c0d955f27a464e94551d12 to your computer and use it in GitHub Desktop.
Save pliniosilveira/960cd8e3c2c0d955f27a464e94551d12 to your computer and use it in GitHub Desktop.
Testing new DG build -- just visits ingestion step
DECLARE DIRTY_DATES_UTC ARRAY<DATE>;
DECLARE MIN_VISIT_DATE DATE;
DECLARE LATEST_BUILD TIMESTAMP;
-- STEP 1 check new data (dirty partitions)
-- SET DIRTY_DATES_UTC = ARRAY(
-- WITH
-- src AS (
-- SELECT PARSE_DATE('%Y%m%d', partition_id) pdate, last_modified_time src_mod
-- FROM `cptsrewards-hrd.im_madhive_vs.INFORMATION_SCHEMA.PARTITIONS`
-- WHERE table_name = 'visits'
-- AND REGEXP_CONTAINS(partition_id, r'^\d{8}$')
-- ),
-- dst AS (
-- SELECT PARSE_DATE('%Y%m%d', partition_id) pdate, last_modified_time dst_mod
-- FROM `mad-data.inmarket_visits.INFORMATION_SCHEMA.PARTITIONS`
-- WHERE table_name = 'visits_ip'
-- AND REGEXP_CONTAINS(partition_id, r'^\d{8}$')
-- )
-- SELECT src.pdate FROM src
-- LEFT JOIN dst using (pdate)
-- WHERE src.src_mod > dst.dst_mod OR dst.dst_mod IS NULL
-- );
-- STEP 2 read new data
CREATE TEMP TABLE base AS (
WITH advs_last_visit AS (
SELECT
advertiser_id,
MAX(visit_timestamp_iso8601) last_visit,
FROM `mad-data.inmarket_visits.visits_ip`
GROUP BY 1
)
SELECT
TIMESTAMP(visit_time_iso8601) AS visit_timestamp_iso8601,
REPLACE(LOWER(did), '-', '') AS did,
brand AS brand,
dmaid AS dma_id,
dma AS dma_name,
commercial_id AS commercial_id,
address1 AS address1,
address2 AS address2,
city AS city,
state AS state,
zip AS zip,
confidence_score AS confidence_score,
visits.advertising_id AS advertiser_id,
location_id AS location_id,
clean_addr AS clean_addr,
FARM_FINGERPRINT(CONCAT(
IF(visits.address1 IS NULL OR LENGTH(visits.address1) = 0, visits.zip, visits.address1),
CAST(TIMESTAMP(visits.visit_time_iso8601) AS STRING),
visits.did
)) AS visit_tx,
FARM_FINGERPRINT(CONCAT(
visits.location_id,
CAST(TIMESTAMP(visits.visit_time_iso8601) AS STRING),
REPLACE(LOWER(did), '-', '')
)) AS visit_tx_v2,
--
CURRENT_TIMESTAMP AS inserted_at,
FROM `cptsrewards-hrd.im_madhive_vs.visits` visits
FULL JOIN mad-data.inmarket_visits.bq_migration advertisers_transition ON visits.advertising_id = advertisers_transition.advertiser_id
FULL JOIN advs_last_visit ON visits.advertising_id = advs_last_visit.advertiser_id
WHERE
(DATE(TIMESTAMP(visit_time_iso8601),'America/New_York') >= advertisers_transition.start_date
OR advertisers_transition.advertiser_id IS NULL) -- if not in the migration table, assume it is set to migrate
--
-- < 7 days: accept everything
-- > 90 days: do not accept anything
-- between 7 and 90 days: accept if it's newer than the newest visit
--
-- AND (
-- DATE_DIFF(CURRENT_DATE('America/New_York'), DATE(TIMESTAMP(visit_time_iso8601), 'America/New_York'), DAY) <= 7
-- OR
-- (DATE_DIFF(CURRENT_DATE('America/New_York'), DATE(TIMESTAMP(visit_time_iso8601), 'America/New_York'), DAY) <= 90
-- AND DATE(TIMESTAMP(visit_time_iso8601), 'America/New_York') > DATE(advs_last_visit.last_visit, 'America/New_York'))
-- OR
-- (DATE_DIFF(CURRENT_DATE('America/New_York'), DATE(TIMESTAMP(visit_time_iso8601), 'America/New_York'), DAY) <= 90
-- AND advs_last_visit.last_visit IS NULL)
-- )
-- To run a full backfill, just comment this line:
--AND DATE(visit_time_iso8601) IN UNNEST(DIRTY_DATES_UTC)
AND DATE(TIMESTAMP(visit_time_iso8601), 'America/New_York') >= '2023-04-01'
);
-- STEP 3 Map what DG builds to use for each visit
CREATE TEMP TABLE visit_dates AS (
SELECT DISTINCT
DATE(visit_timestamp_iso8601) visit_date,
FROM base
);
SET MIN_VISIT_DATE = (SELECT MIN(visit_date) FROM visit_dates);
CREATE TEMP TABLE visit_to_build AS (
WITH dg_builds AS (
SELECT DISTINCT
dg.build AS dg_build,
FROM mad-data.devicegraph.v5_by_device dg
WHERE DATE(dg.build) >= DATE_SUB(MIN_VISIT_DATE, INTERVAL 7 DAY)
)
SELECT
visit_date,
MAX(dg_build) AS latest_dg_build
FROM visit_dates
JOIN dg_builds
ON DATE(dg_build) < visit_date
GROUP BY 1
);
-- STEP 4: Add IP to visits
-- Create an empty table to insert data in the loop
CREATE TEMP TABLE visits_with_ip AS (
SELECT
*,
current_timestamp() AS latest_dg_build,
"" AS ip,
FROM base
limit 0
);
FOR current_visit_date IN (SELECT visit_date FROM visit_dates) DO
SET LATEST_BUILD = (SELECT MAX(latest_dg_build) FROM visit_to_build WHERE visit_date = current_visit_date.visit_date);
INSERT INTO visits_with_ip
WITH dg as
(
SELECT
ip
,device
FROM
mad-data.devicegraph.v5_by_device
WHERE
build = LATEST_BUILD
)
SELECT
visits.*,
LATEST_BUILD,
dg.ip,
FROM base visits
JOIN dg
ON visits.did = dg.device
WHERE DATE(visit_timestamp_iso8601) = current_visit_date.visit_date
AND dg.ip IS NOT NULL
;
END FOR;
DECLARE DIRTY_DATES_UTC ARRAY<DATE>;
DECLARE MIN_VISIT_DATE DATE;
DECLARE LATEST_BUILD TIMESTAMP;
-- STEP 1 check new data (dirty partitions)
-- SET DIRTY_DATES_UTC = ARRAY(
-- WITH
-- src AS (
-- SELECT PARSE_DATE('%Y%m%d', partition_id) pdate, last_modified_time src_mod
-- FROM `cptsrewards-hrd.im_madhive_vs.INFORMATION_SCHEMA.PARTITIONS`
-- WHERE table_name = 'visits'
-- AND REGEXP_CONTAINS(partition_id, r'^\d{8}$')
-- ),
-- dst AS (
-- SELECT PARSE_DATE('%Y%m%d', partition_id) pdate, last_modified_time dst_mod
-- FROM `mad-data.inmarket_visits.INFORMATION_SCHEMA.PARTITIONS`
-- WHERE table_name = 'visits_ip'
-- AND REGEXP_CONTAINS(partition_id, r'^\d{8}$')
-- )
-- SELECT src.pdate FROM src
-- LEFT JOIN dst using (pdate)
-- WHERE src.src_mod > dst.dst_mod OR dst.dst_mod IS NULL
-- );
-- STEP 2 read new data
CREATE TEMP TABLE base AS (
WITH advs_last_visit AS (
SELECT
advertiser_id,
MAX(visit_timestamp_iso8601) last_visit,
FROM `mad-data.inmarket_visits.visits_ip`
GROUP BY 1
)
SELECT
TIMESTAMP(visit_time_iso8601) AS visit_timestamp_iso8601,
REPLACE(LOWER(did), '-', '') AS did,
brand AS brand,
dmaid AS dma_id,
dma AS dma_name,
commercial_id AS commercial_id,
address1 AS address1,
address2 AS address2,
city AS city,
state AS state,
zip AS zip,
confidence_score AS confidence_score,
visits.advertising_id AS advertiser_id,
location_id AS location_id,
clean_addr AS clean_addr,
FARM_FINGERPRINT(CONCAT(
IF(visits.address1 IS NULL OR LENGTH(visits.address1) = 0, visits.zip, visits.address1),
CAST(TIMESTAMP(visits.visit_time_iso8601) AS STRING),
visits.did
)) AS visit_tx,
FARM_FINGERPRINT(CONCAT(
visits.location_id,
CAST(TIMESTAMP(visits.visit_time_iso8601) AS STRING),
REPLACE(LOWER(did), '-', '')
)) AS visit_tx_v2,
--
CURRENT_TIMESTAMP AS inserted_at,
FROM `cptsrewards-hrd.im_madhive_vs.visits` visits
FULL JOIN mad-data.inmarket_visits.bq_migration advertisers_transition ON visits.advertising_id = advertisers_transition.advertiser_id
FULL JOIN advs_last_visit ON visits.advertising_id = advs_last_visit.advertiser_id
WHERE
(DATE(TIMESTAMP(visit_time_iso8601),'America/New_York') >= advertisers_transition.start_date
OR advertisers_transition.advertiser_id IS NULL) -- if not in the migration table, assume it is set to migrate
--
-- < 7 days: accept everything
-- > 90 days: do not accept anything
-- between 7 and 90 days: accept if it's newer than the newest visit
--
-- AND (
-- DATE_DIFF(CURRENT_DATE('America/New_York'), DATE(TIMESTAMP(visit_time_iso8601), 'America/New_York'), DAY) <= 7
-- OR
-- (DATE_DIFF(CURRENT_DATE('America/New_York'), DATE(TIMESTAMP(visit_time_iso8601), 'America/New_York'), DAY) <= 90
-- AND DATE(TIMESTAMP(visit_time_iso8601), 'America/New_York') > DATE(advs_last_visit.last_visit, 'America/New_York'))
-- OR
-- (DATE_DIFF(CURRENT_DATE('America/New_York'), DATE(TIMESTAMP(visit_time_iso8601), 'America/New_York'), DAY) <= 90
-- AND advs_last_visit.last_visit IS NULL)
-- )
-- To run a full backfill, just comment this line:
--AND DATE(visit_time_iso8601) IN UNNEST(DIRTY_DATES_UTC)
AND DATE(TIMESTAMP(visit_time_iso8601), 'America/New_York') >= '2023-04-01'
);
-- STEP 3 Map what DG builds to use for each visit
CREATE TEMP TABLE visit_dates AS (
SELECT DISTINCT
DATE(visit_timestamp_iso8601) visit_date,
FROM base
);
SET MIN_VISIT_DATE = (SELECT MIN(visit_date) FROM visit_dates);
CREATE TEMP TABLE visit_to_build AS (
-- WITH dg_builds AS (
-- SELECT DISTINCT
-- dg.build AS dg_build,
-- FROM mad-data.devicegraph.v5_by_device dg
-- WHERE build = '2023-05-27T10:00:38.089502' --DATE(dg.build) >= DATE_SUB(MIN_VISIT_DATE, INTERVAL 7 DAY)
-- )
SELECT
visit_date,
TIMESTAMP('2023-05-27 10:00:38.089502 UTC') AS latest_dg_build
FROM visit_dates
--JOIN dg_builds
--ON DATE(dg_build) < visit_date
GROUP BY 1
);
-- STEP 4: Add IP to visits
-- Create an empty table to insert data in the loop
CREATE TEMP TABLE visits_with_ip AS (
SELECT
*,
current_timestamp() AS latest_dg_build,
"" AS ip,
FROM base
limit 0
);
FOR current_visit_date IN (SELECT visit_date FROM visit_dates) DO
SET LATEST_BUILD = (SELECT MAX(latest_dg_build) FROM visit_to_build WHERE visit_date = current_visit_date.visit_date);
INSERT INTO visits_with_ip
WITH dg as
(
SELECT
ip
,device
FROM
mad-data.devicegraph.v5_by_device
WHERE
build = LATEST_BUILD
)
SELECT
visits.*,
LATEST_BUILD,
dg.ip,
FROM base visits
JOIN dg
ON visits.did = dg.device
WHERE DATE(visit_timestamp_iso8601) = current_visit_date.visit_date
AND dg.ip IS NOT NULL
;
END FOR;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment