Created
October 4, 2019 19:50
-
-
Save rasmi/35b2cd24106954b3c1e1f40ba72d125a to your computer and use it in GitHub Desktop.
BigQuery time series aggregation example by Andrew Leach.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright 2019 Google LLC. | |
# SPDX-License-Identifier: Apache-2.0 | |
WITH | |
# Generate a sample time series from the taxi trips dataset. | |
sample_time_series AS ( | |
SELECT | |
SUBSTR(taxi_id, 16, 24) AS taxi_id, # Truncate for readability. | |
trip_start_timestamp, | |
trip_seconds | |
FROM | |
`bigquery-public-data.chicago_taxi_trips.taxi_trips` | |
WHERE | |
SUBSTR(taxi_id, 0, 2) = 'cc' # Downsample data based on hash | |
ORDER BY | |
taxi_id, | |
trip_start_timestamp | |
) | |
# Predict the trip seconds for the next ride. | |
# Note that time intervals here are not regular (ex. hourly). | |
# Some cleanup still required, such as removing next_trip_seconds nulls | |
# and sequences that are too short. | |
# Can create a second window for a different level of aggregation for | |
# time series history. | |
SELECT | |
taxi_id, | |
trip_start_timestamp, | |
trip_seconds, | |
LEAD(trip_seconds, 1) OVER ( | |
PARTITION BY | |
taxi_id | |
ORDER BY | |
trip_start_timestamp | |
) AS next_trip_seconds, | |
# ARRAY_AGG cannot aggreggate nulls. | |
ARRAY_AGG(IFNULL(trip_seconds, 0)) OVER ( | |
PARTITION BY | |
taxi_id | |
ORDER BY | |
# AutoML Tables will treat as an ordered array | |
# OFFSET(0) in array should be the oldest element | |
trip_start_timestamp | |
ROWS BETWEEN 10 PRECEDING AND CURRENT ROW | |
) AS trip_history | |
FROM | |
sample_time_series |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment