robrich · October 12, 2020 19:34
diff --git a/time-series.sql b/time-series.sql
 -- TIME SERIES
 -- ===========

 -- setup schema
 CREATE DATABASE temp_history;

 USE temp_history;

 CREATE TABLE temperatures (
  location VARCHAR(200) NOT NULL,
  read_date DATETIME(6) NOT NULL,
  latitude DOUBLE,
  longitude DOUBLE,
  temperatureF DOUBLE,
  KEY (read_date, location) USING CLUSTERED COLUMNSTORE
 );

 -- load values
 -- Thank you to https://www.ncdc.noaa.gov/ for 2010's data
 CREATE PIPELINE temperatures
 AS LOAD DATA FS '/vagrant/noaa-weather-data.txt'
 INTO TABLE temperatures
 FORMAT CSV
 FIELDS TERMINATED BY '\t'
 LINES TERMINATED BY '\n'
 IGNORE 1 LINES;

 -- start pipeline
 TEST PIPELINE temperatures LIMIT 10;
 START PIPELINE temperatures FOREGROUND LIMIT 1 BATCHES;
 START PIPELINE temperatures;

 -- verify pipeline
 SELECT * FROM temperatures;
 SELECT count(*) FROM temperatures;
 SELECT * FROM information_schema.PIPELINES_BATCHES_SUMMARY;
 SELECT location, count(*) FROM temperatures GROUP BY 1;


 -- Average temperature all year
 SELECT location, ROUND(AVG(temperatureF),1) as 'avg temp F' FROM temperatures GROUP BY location;

 -- Average temperature by day
 SELECT location, read_date :> date,
 ROUND(AVG(temperatureF), 1) as 'avg temp F', MIN(temperatureF), MAX(temperatureF)
 FROM temperatures
 GROUP by 1, 2
 ORDER BY 1, 2;

 -- Average weekly temperature in March
 SELECT location, TIME_BUCKET("7d", read_date) as 'week',
 ROUND(AVG(temperatureF), 1) as 'avg temp F', MIN(temperatureF), MAX(temperatureF)
 FROM temperatures
 WHERE read_date >= '2010-03-01' AND read_date <= '2010-03-31'
 GROUP BY 1, 2 ORDER BY 1, 2;


 -- Candlestick chart for July
 WITH ranked AS (
  SELECT location, read_date,
  RANK() OVER w as r,
  MIN(temperatureF) over w as 'min',
  MAX(temperatureF) over w as 'max',
  FIRST_VALUE(temperatureF) over w as 'first',
  LAST_VALUE(temperatureF) over w as 'last'
  FROM temperatures
  WINDOW w AS (
    PARTITION BY location, time_bucket('1d', read_date)
    ORDER BY read_date
    ROWS BETWEEN UNBOUNDED PRECEDING
    AND UNBOUNDED FOLLOWING
  )
 )
 SELECT location, time_bucket('1d', read_date) as 'date',
 min, max, first, last
 FROM ranked
 WHERE r = 1
 AND (read_date >= '2010-07-01' AND read_date <= '2010-07-31')
 ORDER BY 1, 2;


 -- smoothing: average 3 preceeding rows into current row 
 SELECT location, read_date, temperatureF,
 AVG(temperatureF) OVER (ORDER BY location, read_date ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS smoothed_temp
 FROM temperatures
 WHERE (read_date >= '2010-03-01' AND read_date <= '2010-03-31')
 ORDER BY 1, 2;


 -- Cleanup
 STOP PIPELINE temperatures;
 DROP PIPELINE temperatures;
 DROP TABLE temperatures;
 DROP DATABASE temp_history;
	-- TIME SERIES
	-- ===========

	-- setup schema
	CREATE DATABASE temp_history;

	USE temp_history;

	CREATE TABLE temperatures (
	location VARCHAR(200) NOT NULL,
	read_date DATETIME(6) NOT NULL,
	latitude DOUBLE,
	longitude DOUBLE,
	temperatureF DOUBLE,
	KEY (read_date, location) USING CLUSTERED COLUMNSTORE
	);

	-- load values
	-- Thank you to https://www.ncdc.noaa.gov/ for 2010's data
	CREATE PIPELINE temperatures
	AS LOAD DATA FS '/vagrant/noaa-weather-data.txt'
	INTO TABLE temperatures
	FORMAT CSV
	FIELDS TERMINATED BY '\t'
	LINES TERMINATED BY '\n'
	IGNORE 1 LINES;

	-- start pipeline
	TEST PIPELINE temperatures LIMIT 10;
	START PIPELINE temperatures FOREGROUND LIMIT 1 BATCHES;
	START PIPELINE temperatures;

	-- verify pipeline
	SELECT * FROM temperatures;
	SELECT count(*) FROM temperatures;
	SELECT * FROM information_schema.PIPELINES_BATCHES_SUMMARY;
	SELECT location, count(*) FROM temperatures GROUP BY 1;


	-- Average temperature all year
	SELECT location, ROUND(AVG(temperatureF),1) as 'avg temp F' FROM temperatures GROUP BY location;

	-- Average temperature by day
	SELECT location, read_date :> date,
	ROUND(AVG(temperatureF), 1) as 'avg temp F', MIN(temperatureF), MAX(temperatureF)
	FROM temperatures
	GROUP by 1, 2
	ORDER BY 1, 2;

	-- Average weekly temperature in March
	SELECT location, TIME_BUCKET("7d", read_date) as 'week',
	ROUND(AVG(temperatureF), 1) as 'avg temp F', MIN(temperatureF), MAX(temperatureF)
	FROM temperatures
	WHERE read_date >= '2010-03-01' AND read_date <= '2010-03-31'
	GROUP BY 1, 2 ORDER BY 1, 2;


	-- Candlestick chart for July
	WITH ranked AS (
	SELECT location, read_date,
	RANK() OVER w as r,
	MIN(temperatureF) over w as 'min',
	MAX(temperatureF) over w as 'max',
	FIRST_VALUE(temperatureF) over w as 'first',
	LAST_VALUE(temperatureF) over w as 'last'
	FROM temperatures
	WINDOW w AS (
	PARTITION BY location, time_bucket('1d', read_date)
	ORDER BY read_date
	ROWS BETWEEN UNBOUNDED PRECEDING
	AND UNBOUNDED FOLLOWING
	)
	)
	SELECT location, time_bucket('1d', read_date) as 'date',
	min, max, first, last
	FROM ranked
	WHERE r = 1
	AND (read_date >= '2010-07-01' AND read_date <= '2010-07-31')
	ORDER BY 1, 2;


	-- smoothing: average 3 preceeding rows into current row
	SELECT location, read_date, temperatureF,
	AVG(temperatureF) OVER (ORDER BY location, read_date ROWS BETWEEN 3 PRECEDING AND CURRENT ROW) AS smoothed_temp
	FROM temperatures
	WHERE (read_date >= '2010-03-01' AND read_date <= '2010-03-31')
	ORDER BY 1, 2;


	-- Cleanup
	STOP PIPELINE temperatures;
	DROP PIPELINE temperatures;
	DROP TABLE temperatures;
	DROP DATABASE temp_history;