Created
May 30, 2023 17:54
-
-
Save pliniosilveira/960cd8e3c2c0d955f27a464e94551d12 to your computer and use it in GitHub Desktop.
Testing new DG build -- just visits ingestion step
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
DECLARE DIRTY_DATES_UTC ARRAY<DATE>; | |
DECLARE MIN_VISIT_DATE DATE; | |
DECLARE LATEST_BUILD TIMESTAMP; | |
-- STEP 1 check new data (dirty partitions) | |
-- SET DIRTY_DATES_UTC = ARRAY( | |
-- WITH | |
-- src AS ( | |
-- SELECT PARSE_DATE('%Y%m%d', partition_id) pdate, last_modified_time src_mod | |
-- FROM `cptsrewards-hrd.im_madhive_vs.INFORMATION_SCHEMA.PARTITIONS` | |
-- WHERE table_name = 'visits' | |
-- AND REGEXP_CONTAINS(partition_id, r'^\d{8}$') | |
-- ), | |
-- dst AS ( | |
-- SELECT PARSE_DATE('%Y%m%d', partition_id) pdate, last_modified_time dst_mod | |
-- FROM `mad-data.inmarket_visits.INFORMATION_SCHEMA.PARTITIONS` | |
-- WHERE table_name = 'visits_ip' | |
-- AND REGEXP_CONTAINS(partition_id, r'^\d{8}$') | |
-- ) | |
-- SELECT src.pdate FROM src | |
-- LEFT JOIN dst using (pdate) | |
-- WHERE src.src_mod > dst.dst_mod OR dst.dst_mod IS NULL | |
-- ); | |
-- STEP 2 read new data | |
CREATE TEMP TABLE base AS ( | |
WITH advs_last_visit AS ( | |
SELECT | |
advertiser_id, | |
MAX(visit_timestamp_iso8601) last_visit, | |
FROM `mad-data.inmarket_visits.visits_ip` | |
GROUP BY 1 | |
) | |
SELECT | |
TIMESTAMP(visit_time_iso8601) AS visit_timestamp_iso8601, | |
REPLACE(LOWER(did), '-', '') AS did, | |
brand AS brand, | |
dmaid AS dma_id, | |
dma AS dma_name, | |
commercial_id AS commercial_id, | |
address1 AS address1, | |
address2 AS address2, | |
city AS city, | |
state AS state, | |
zip AS zip, | |
confidence_score AS confidence_score, | |
visits.advertising_id AS advertiser_id, | |
location_id AS location_id, | |
clean_addr AS clean_addr, | |
FARM_FINGERPRINT(CONCAT( | |
IF(visits.address1 IS NULL OR LENGTH(visits.address1) = 0, visits.zip, visits.address1), | |
CAST(TIMESTAMP(visits.visit_time_iso8601) AS STRING), | |
visits.did | |
)) AS visit_tx, | |
FARM_FINGERPRINT(CONCAT( | |
visits.location_id, | |
CAST(TIMESTAMP(visits.visit_time_iso8601) AS STRING), | |
REPLACE(LOWER(did), '-', '') | |
)) AS visit_tx_v2, | |
-- | |
CURRENT_TIMESTAMP AS inserted_at, | |
FROM `cptsrewards-hrd.im_madhive_vs.visits` visits | |
FULL JOIN mad-data.inmarket_visits.bq_migration advertisers_transition ON visits.advertising_id = advertisers_transition.advertiser_id | |
FULL JOIN advs_last_visit ON visits.advertising_id = advs_last_visit.advertiser_id | |
WHERE | |
(DATE(TIMESTAMP(visit_time_iso8601),'America/New_York') >= advertisers_transition.start_date | |
OR advertisers_transition.advertiser_id IS NULL) -- if not in the migration table, assume it is set to migrate | |
-- | |
-- < 7 days: accept everything | |
-- > 90 days: do not accept anything | |
-- between 7 and 90 days: accept if it's newer than the newest visit | |
-- | |
-- AND ( | |
-- DATE_DIFF(CURRENT_DATE('America/New_York'), DATE(TIMESTAMP(visit_time_iso8601), 'America/New_York'), DAY) <= 7 | |
-- OR | |
-- (DATE_DIFF(CURRENT_DATE('America/New_York'), DATE(TIMESTAMP(visit_time_iso8601), 'America/New_York'), DAY) <= 90 | |
-- AND DATE(TIMESTAMP(visit_time_iso8601), 'America/New_York') > DATE(advs_last_visit.last_visit, 'America/New_York')) | |
-- OR | |
-- (DATE_DIFF(CURRENT_DATE('America/New_York'), DATE(TIMESTAMP(visit_time_iso8601), 'America/New_York'), DAY) <= 90 | |
-- AND advs_last_visit.last_visit IS NULL) | |
-- ) | |
-- To run a full backfill, just comment this line: | |
--AND DATE(visit_time_iso8601) IN UNNEST(DIRTY_DATES_UTC) | |
AND DATE(TIMESTAMP(visit_time_iso8601), 'America/New_York') >= '2023-04-01' | |
); | |
-- STEP 3 Map what DG builds to use for each visit | |
CREATE TEMP TABLE visit_dates AS ( | |
SELECT DISTINCT | |
DATE(visit_timestamp_iso8601) visit_date, | |
FROM base | |
); | |
SET MIN_VISIT_DATE = (SELECT MIN(visit_date) FROM visit_dates); | |
CREATE TEMP TABLE visit_to_build AS ( | |
WITH dg_builds AS ( | |
SELECT DISTINCT | |
dg.build AS dg_build, | |
FROM mad-data.devicegraph.v5_by_device dg | |
WHERE DATE(dg.build) >= DATE_SUB(MIN_VISIT_DATE, INTERVAL 7 DAY) | |
) | |
SELECT | |
visit_date, | |
MAX(dg_build) AS latest_dg_build | |
FROM visit_dates | |
JOIN dg_builds | |
ON DATE(dg_build) < visit_date | |
GROUP BY 1 | |
); | |
-- STEP 4: Add IP to visits | |
-- Create an empty table to insert data in the loop | |
CREATE TEMP TABLE visits_with_ip AS ( | |
SELECT | |
*, | |
current_timestamp() AS latest_dg_build, | |
"" AS ip, | |
FROM base | |
limit 0 | |
); | |
FOR current_visit_date IN (SELECT visit_date FROM visit_dates) DO | |
SET LATEST_BUILD = (SELECT MAX(latest_dg_build) FROM visit_to_build WHERE visit_date = current_visit_date.visit_date); | |
INSERT INTO visits_with_ip | |
WITH dg as | |
( | |
SELECT | |
ip | |
,device | |
FROM | |
mad-data.devicegraph.v5_by_device | |
WHERE | |
build = LATEST_BUILD | |
) | |
SELECT | |
visits.*, | |
LATEST_BUILD, | |
dg.ip, | |
FROM base visits | |
JOIN dg | |
ON visits.did = dg.device | |
WHERE DATE(visit_timestamp_iso8601) = current_visit_date.visit_date | |
AND dg.ip IS NOT NULL | |
; | |
END FOR; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
DECLARE DIRTY_DATES_UTC ARRAY<DATE>; | |
DECLARE MIN_VISIT_DATE DATE; | |
DECLARE LATEST_BUILD TIMESTAMP; | |
-- STEP 1 check new data (dirty partitions) | |
-- SET DIRTY_DATES_UTC = ARRAY( | |
-- WITH | |
-- src AS ( | |
-- SELECT PARSE_DATE('%Y%m%d', partition_id) pdate, last_modified_time src_mod | |
-- FROM `cptsrewards-hrd.im_madhive_vs.INFORMATION_SCHEMA.PARTITIONS` | |
-- WHERE table_name = 'visits' | |
-- AND REGEXP_CONTAINS(partition_id, r'^\d{8}$') | |
-- ), | |
-- dst AS ( | |
-- SELECT PARSE_DATE('%Y%m%d', partition_id) pdate, last_modified_time dst_mod | |
-- FROM `mad-data.inmarket_visits.INFORMATION_SCHEMA.PARTITIONS` | |
-- WHERE table_name = 'visits_ip' | |
-- AND REGEXP_CONTAINS(partition_id, r'^\d{8}$') | |
-- ) | |
-- SELECT src.pdate FROM src | |
-- LEFT JOIN dst using (pdate) | |
-- WHERE src.src_mod > dst.dst_mod OR dst.dst_mod IS NULL | |
-- ); | |
-- STEP 2 read new data | |
CREATE TEMP TABLE base AS ( | |
WITH advs_last_visit AS ( | |
SELECT | |
advertiser_id, | |
MAX(visit_timestamp_iso8601) last_visit, | |
FROM `mad-data.inmarket_visits.visits_ip` | |
GROUP BY 1 | |
) | |
SELECT | |
TIMESTAMP(visit_time_iso8601) AS visit_timestamp_iso8601, | |
REPLACE(LOWER(did), '-', '') AS did, | |
brand AS brand, | |
dmaid AS dma_id, | |
dma AS dma_name, | |
commercial_id AS commercial_id, | |
address1 AS address1, | |
address2 AS address2, | |
city AS city, | |
state AS state, | |
zip AS zip, | |
confidence_score AS confidence_score, | |
visits.advertising_id AS advertiser_id, | |
location_id AS location_id, | |
clean_addr AS clean_addr, | |
FARM_FINGERPRINT(CONCAT( | |
IF(visits.address1 IS NULL OR LENGTH(visits.address1) = 0, visits.zip, visits.address1), | |
CAST(TIMESTAMP(visits.visit_time_iso8601) AS STRING), | |
visits.did | |
)) AS visit_tx, | |
FARM_FINGERPRINT(CONCAT( | |
visits.location_id, | |
CAST(TIMESTAMP(visits.visit_time_iso8601) AS STRING), | |
REPLACE(LOWER(did), '-', '') | |
)) AS visit_tx_v2, | |
-- | |
CURRENT_TIMESTAMP AS inserted_at, | |
FROM `cptsrewards-hrd.im_madhive_vs.visits` visits | |
FULL JOIN mad-data.inmarket_visits.bq_migration advertisers_transition ON visits.advertising_id = advertisers_transition.advertiser_id | |
FULL JOIN advs_last_visit ON visits.advertising_id = advs_last_visit.advertiser_id | |
WHERE | |
(DATE(TIMESTAMP(visit_time_iso8601),'America/New_York') >= advertisers_transition.start_date | |
OR advertisers_transition.advertiser_id IS NULL) -- if not in the migration table, assume it is set to migrate | |
-- | |
-- < 7 days: accept everything | |
-- > 90 days: do not accept anything | |
-- between 7 and 90 days: accept if it's newer than the newest visit | |
-- | |
-- AND ( | |
-- DATE_DIFF(CURRENT_DATE('America/New_York'), DATE(TIMESTAMP(visit_time_iso8601), 'America/New_York'), DAY) <= 7 | |
-- OR | |
-- (DATE_DIFF(CURRENT_DATE('America/New_York'), DATE(TIMESTAMP(visit_time_iso8601), 'America/New_York'), DAY) <= 90 | |
-- AND DATE(TIMESTAMP(visit_time_iso8601), 'America/New_York') > DATE(advs_last_visit.last_visit, 'America/New_York')) | |
-- OR | |
-- (DATE_DIFF(CURRENT_DATE('America/New_York'), DATE(TIMESTAMP(visit_time_iso8601), 'America/New_York'), DAY) <= 90 | |
-- AND advs_last_visit.last_visit IS NULL) | |
-- ) | |
-- To run a full backfill, just comment this line: | |
--AND DATE(visit_time_iso8601) IN UNNEST(DIRTY_DATES_UTC) | |
AND DATE(TIMESTAMP(visit_time_iso8601), 'America/New_York') >= '2023-04-01' | |
); | |
-- STEP 3 Map what DG builds to use for each visit | |
CREATE TEMP TABLE visit_dates AS ( | |
SELECT DISTINCT | |
DATE(visit_timestamp_iso8601) visit_date, | |
FROM base | |
); | |
SET MIN_VISIT_DATE = (SELECT MIN(visit_date) FROM visit_dates); | |
CREATE TEMP TABLE visit_to_build AS ( | |
-- WITH dg_builds AS ( | |
-- SELECT DISTINCT | |
-- dg.build AS dg_build, | |
-- FROM mad-data.devicegraph.v5_by_device dg | |
-- WHERE build = '2023-05-27T10:00:38.089502' --DATE(dg.build) >= DATE_SUB(MIN_VISIT_DATE, INTERVAL 7 DAY) | |
-- ) | |
SELECT | |
visit_date, | |
TIMESTAMP('2023-05-27 10:00:38.089502 UTC') AS latest_dg_build | |
FROM visit_dates | |
--JOIN dg_builds | |
--ON DATE(dg_build) < visit_date | |
GROUP BY 1 | |
); | |
-- STEP 4: Add IP to visits | |
-- Create an empty table to insert data in the loop | |
CREATE TEMP TABLE visits_with_ip AS ( | |
SELECT | |
*, | |
current_timestamp() AS latest_dg_build, | |
"" AS ip, | |
FROM base | |
limit 0 | |
); | |
FOR current_visit_date IN (SELECT visit_date FROM visit_dates) DO | |
SET LATEST_BUILD = (SELECT MAX(latest_dg_build) FROM visit_to_build WHERE visit_date = current_visit_date.visit_date); | |
INSERT INTO visits_with_ip | |
WITH dg as | |
( | |
SELECT | |
ip | |
,device | |
FROM | |
mad-data.devicegraph.v5_by_device | |
WHERE | |
build = LATEST_BUILD | |
) | |
SELECT | |
visits.*, | |
LATEST_BUILD, | |
dg.ip, | |
FROM base visits | |
JOIN dg | |
ON visits.did = dg.device | |
WHERE DATE(visit_timestamp_iso8601) = current_visit_date.visit_date | |
AND dg.ip IS NOT NULL | |
; | |
END FOR; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment