Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save MrCordeiro/5be54914d95f6c9970825726ccf0f42d to your computer and use it in GitHub Desktop.
Save MrCordeiro/5be54914d95f6c9970825726ccf0f42d to your computer and use it in GitHub Desktop.
Time series train-dev split with cumulative data
"""
Simple data split between training and development sets
The training set will have at least 6 months of data (26 weeks)
with every batch adding a new week.
The development set will have a single week of data.
"""
import datetime
import math
from pyspark.sql import types
from pyspark.sql import functions as f
# --- Split parameters ---------------------------------------------------
DAYS_IN_WEEK = 7
MIN_WEEKS_IN_TRAIN = 26
WEEKS_IN_DEV = 1

# Offset (in weeks) from the earliest date to the first split point:
# one full training window plus one dev window.
stride = MIN_WEEKS_IN_TRAIN + WEEKS_IN_DEV

# Output schema = input schema plus a nullable "batch" date column that
# records which split each row belongs to.
new_column_field = [types.StructField("batch", types.DateType(), True)]
schema = types.StructType([*df.schema.fields, *new_column_field])

# Start from empty accumulators; the loop below appends one batch per
# split date.
train_df = spark.createDataFrame([], schema)
dev_df = spark.createDataFrame([], schema)
# --- Time windows -------------------------------------------------------
# Overall date range covered by the data (assumes "date" is a DateType
# column — values come back as datetime.date).
start_date = df.agg(f.min("date")).collect()[0][0]
end_date = df.agg(f.max("date")).collect()[0][0]

# Days remaining after reserving the initial `stride` weeks for the
# first train+dev window; each whole week left yields one more split.
remaining = (end_date - start_date) - datetime.timedelta(weeks=stride)
num_weeks = math.floor(remaining.days / DAYS_IN_WEEK)

# One split date per week: the first falls `stride` weeks after the
# start, each subsequent one advances by a single week.
split_dates = [
    start_date + datetime.timedelta(weeks=stride + week_nb)
    for week_nb in range(num_weeks)
]
# --- Create one (train, dev) batch per split date -----------------------
# BUG FIX: the original iterated `split_dates[:1]`, producing only the
# first batch. Iterate over every split date so each batch grows the
# training window by one week, as the module docstring describes.
for split_date in split_dates:
    # Training set: cumulative — everything from the start of the data
    # up to and including the split date, tagged with the batch date.
    train_data = (
        df
        .filter(f.col("date").between(start_date, split_date))
        .withColumn("batch", f.lit(split_date))
    )
    train_df = train_df.union(train_data)

    # Dev set: exactly WEEKS_IN_DEV weeks immediately after the split.
    # BUG FIX: `Column.between` is inclusive on both ends, so adding a
    # full week to the start produced an 8-day window; subtract one day
    # to keep the dev window at exactly DAYS_IN_WEEK * WEEKS_IN_DEV days.
    week_dev_start = split_date + datetime.timedelta(days=1)
    week_dev_end = week_dev_start + datetime.timedelta(
        days=DAYS_IN_WEEK * WEEKS_IN_DEV - 1
    )
    dev_data = (
        df
        .filter(f.col("date").between(week_dev_start, week_dev_end))
        .withColumn("batch", f.lit(split_date))
    )
    dev_df = dev_df.union(dev_data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment