- Constants
- Series generation
- Reading JSON
- Schemas
- Day of week
https://stackoverflow.com/questions/2201170/how-to-select-multiple-rows-filled-with-constants
In PostgreSQL, you can do:
-- PostgreSQL: inline constant rows via a VALUES list, aliased with column names
SELECT * FROM (
VALUES (1, 2), (3, 4)
) AS q (col1, col2)
In other systems, just use UNION ALL:
-- Portable fallback: one SELECT per constant row, glued with UNION ALL
-- (UNION ALL keeps all rows and skips the sort/dedup a plain UNION would do)
SELECT 1 AS col1, 2 AS col2
-- FROM dual
-- uncomment the line above if in Oracle
UNION ALL
SELECT 3 AS col1, 4 AS col2 -- fixed: was "3 AS col2"; second row is (3, 4) to match the VALUES example
-- FROM dual
-- uncomment the line above if in Oracle
-- Spark SQL: parenthesizing the select list yields ONE struct column, not three columns
-- (see result on the line below this snippet)
with values as (
select (1, 2, 3)
)
select * from values
named_struct(col1, 1, col2, 2, col3, 3) 1 {"col1": 1, "col2": 2, "col3": 3}
-- Two parenthesized groups -> two struct columns, aliased colA and colB
with values (colA, colB) as (
select (1, 2, 3), (1, 2, 3)
)
select * from values
Wrapping the select list in parentheses () produces a single named_struct column instead of separate columns
-- No parentheses -> three plain scalar columns colA, colB, colC
with values (colA, colB, colC) as (
select 1, 2, 3
)
select * from values
looks more 'normal'
-- Multiple constant rows: stack SELECTs with UNION ALL inside the CTE
with values (colA, colB, colC) as (
select 1, 2, 3
union all
select 4, 5, 6
)
select * from values
colA colB colC
1 2 3
4 5 6
In postgres
-- Postgres: one row per day for the trailing week (values are timestamps at midnight)
WITH days_list AS (
-- alias the set-returning function; otherwise the output column is literally named "generate_series"
SELECT generate_series((current_date - INTERVAL '7 days'), current_date, '1 day'::INTERVAL) AS day
)
select * from days_list;
SELECT generate_series(start, stop, '1 day'::interval)::date AS dt FROM my_table
In presto
-- Presto/Trino: sequence() builds an array of dates; UNNEST pivots the array to rows
with days_list as (
select to_iso8601(cast(days as date)) as day
-- fixed: UNNEST must appear directly in the FROM clause; wrapping it in parentheses
-- as a derived table is a syntax error in Presto/Trino
from unnest(sequence(date(current_date - INTERVAL '7' DAY), date(current_date), interval '1' day)) as seq (days)
)
select * from days_list
Spark
https://stackoverflow.com/questions/43141671/sparksql-on-pyspark-how-to-generate-time-series
from pyspark.sql.functions import sequence, to_date, explode, col
spark.sql("SELECT sequence(to_date('2018-01-01'), to_date('2018-03-01'), interval 1 month) as date")
Then use explode to pivot to rows
spark.sql("SELECT sequence(to_date('2018-01-01'), to_date('2018-03-01'), interval 1 month) as date").withColumn("date", explode(col("date"))
Another way is to create a dataframe from raw date data
-- Spark SQL equivalent: explode() the sequence() array straight into rows inside a CTE
with days as (
SELECT explode(sequence(to_date('2018-01-01'), to_date('2018-03-01'), interval 1 month)) as date
)
select * from days
-- One row per day for +/- 7 days around an anchor date
with days as (
SELECT explode(sequence(date_add('2022-05-27', -7) , date_add('2022-05-27', +7) , interval 1 day)) as date
)
select * from days
-- Reusable temp view: anchor day and half-window width are "parameters" held in the input CTE
create or replace temporary view days as (
with input as (
select
'2022-05-27' as day,
7 as range
), day_series as (
-- scalar subqueries read the parameters; sequence spans [day - range, day + range], one row per day
select explode(sequence(date_add((select day from input), -1*(select range from input)), date_add((select day from input), (select range from input)), interval 1 day)) as days
) select * from day_series
)
-- NOTE(review): incomplete fragment. The CTE returns a single array column (no explode),
-- the final SELECT ... FROM is missing before the WHERE, and the filter references "day"
-- while the CTE column is named "date". The 2018 sequence also cannot overlap the 2022
-- filter window — reconcile the dates before using this.
with days as (
SELECT sequence(to_date('2018-01-01'), to_date('2018-03-01'), interval 1 month) as date
)
where day between date_add('2022-05-27', -7) and date_add('2022-05-27', +7)
Easiest if the source files are in "JSON Lines" format, where each line of the file is a self-contained JSON object.
For a regular JSON file, use .option("multiLine", "true")
spark.read.json(s3_path)
Sources:
df.schema (struct)
add callable:True
fieldNames callable:True
fields callable:False
fromInternal callable:True
fromJson callable:True
json callable:True
jsonValue callable:True
names callable:False
needConversion callable:True
simpleString callable:True
toInternal callable:True
typeName callable:True
df.schema.fieldNames()
['event_time', 'event_type ..
df.schema.fields
[StructField(event_time,StringType,true), StructF...
df.schema.simpleString()
struct<event_time:string,event_type:string,inputs:array<stru...
df.schema.json()
{"fields":[{"metadata":{},"name":"event_time","nullable":true,"type":"st
df.dtypes (a property, not a method — no parentheses)
[('event_time', 'string'), ('event_type', 'string'), ('in
df.printSchema()
root
|-- event_time: string (nullable = true)
|-- event_type: string (nullable = true)
|-- inputs: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- bytes: long (nullable = true)
| | |-- name: string (nullable = true)
As a string
date_format(my_timestamp, 'EEEE')
As a number
weekday('2009-07-30') -- Spark: weekday index of a date
-- 0 Monday, 1 Tuesday... 6 Sunday
-- Fold (aggregate) over the 'usage' field of the interval_data array:
-- accumulator a starts at 0.0 (double), each element y is added in turn -> total usage
SELECT
aggregate(
interval_data ['usage'],
double(0.0),(a, y) -> a + y
) as usage
from
source_data
where
day = current_date -2 -- two days ago (date minus integer days)
and id = '3DD'