Created
October 12, 2020 19:34
-
-
Save robrich/462c31f5861628be8b571c8d5ee4726c to your computer and use it in GitHub Desktop.
time-series.sql
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- TIME SERIES | |
-- =========== | |
-- setup schema | |
CREATE DATABASE temp_history; | |
USE temp_history; | |
CREATE TABLE temperatures ( | |
location VARCHAR(200) NOT NULL, | |
read_date DATETIME(6) NOT NULL, | |
latitude DOUBLE, | |
longitude DOUBLE, | |
temperatureF DOUBLE, | |
KEY (read_date, location) USING CLUSTERED COLUMNSTORE | |
); | |
-- load values | |
-- Thank you to https://www.ncdc.noaa.gov/ for 2010's data | |
CREATE PIPELINE temperatures | |
AS LOAD DATA FS '/vagrant/noaa-weather-data.txt' | |
INTO TABLE temperatures | |
FORMAT CSV | |
FIELDS TERMINATED BY '\t' | |
LINES TERMINATED BY '\n' | |
IGNORE 1 LINES; | |
-- start pipeline | |
TEST PIPELINE temperatures LIMIT 10; | |
START PIPELINE temperatures FOREGROUND LIMIT 1 BATCHES; | |
START PIPELINE temperatures; | |
-- verify pipeline | |
SELECT * FROM temperatures; | |
SELECT count(*) FROM temperatures; | |
SELECT * FROM information_schema.PIPELINES_BATCHES_SUMMARY; | |
SELECT location, count(*) FROM temperatures GROUP BY 1; | |
-- Average temperature all year | |
SELECT location, ROUND(AVG(temperatureF),1) as 'avg temp F' FROM temperatures GROUP BY location; | |
-- Average temperature by day | |
SELECT location, read_date :> date, | |
ROUND(AVG(temperatureF), 1) as 'avg temp F', MIN(temperatureF), MAX(temperatureF) | |
FROM temperatures | |
GROUP by 1, 2 | |
ORDER BY 1, 2; | |
-- Average weekly temperature in March | |
SELECT location, TIME_BUCKET("7d", read_date) as 'week', | |
ROUND(AVG(temperatureF), 1) as 'avg temp F', MIN(temperatureF), MAX(temperatureF) | |
FROM temperatures | |
WHERE read_date >= '2010-03-01' AND read_date <= '2010-03-31' | |
GROUP BY 1, 2 ORDER BY 1, 2; | |
-- Candlestick chart for July | |
WITH ranked AS ( | |
SELECT location, read_date, | |
RANK() OVER w as r, | |
MIN(temperatureF) over w as 'min', | |
MAX(temperatureF) over w as 'max', | |
FIRST_VALUE(temperatureF) over w as 'first', | |
LAST_VALUE(temperatureF) over w as 'last' | |
FROM temperatures | |
WINDOW w AS ( | |
PARTITION BY location, time_bucket('1d', read_date) | |
ORDER BY read_date | |
ROWS BETWEEN UNBOUNDED PRECEDING | |
AND UNBOUNDED FOLLOWING | |
) | |
) | |
SELECT location, time_bucket('1d', read_date) as 'date', | |
min, max, first, last | |
FROM ranked | |
WHERE r = 1 | |
AND (read_date >= '2010-07-01' AND read_date <= '2010-07-31') | |
ORDER BY 1, 2; | |
-- smoothing: average 3 preceeding rows into current row | |
SELECT location, read_date, temperatureF, | |
AVG(temperatureF) OVER (ORDER BY location, read_date ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS smoothed_temp | |
FROM temperatures | |
WHERE (read_date >= '2010-03-01' AND read_date <= '2010-03-31') | |
ORDER BY 1, 2; | |
-- Cleanup | |
STOP PIPELINE temperatures; | |
DROP PIPELINE temperatures; | |
DROP TABLE temperatures; | |
DROP DATABASE temp_history; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment