Skip to content

Instantly share code, notes, and snippets.

@kathsherratt
Created December 5, 2022 18:46
Show Gist options
  • Save kathsherratt/c942c1a5870db243a5c6a9a066de4f6e to your computer and use it in GitHub Desktop.
Save kathsherratt/c942c1a5870db243a5c6a9a066de4f6e to your computer and use it in GitHub Desktop.
# Source: code by Hugo Gruson (GitHub @Bisaloo)
# https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/blob/main/.github/workflows/create-parquet.yml
library(readr)
library(arrow)
library(dplyr)
library(lubridate)
# Location lookup table: read data-locations/locations_eu.csv (path resolved
# with here::here()) and keep only the three columns used further down.
locations <-
  read_csv(file = here::here("data-locations", "locations_eu.csv")) |>
  dplyr::select(
    location,
    location_name,
    population
  )
# Open every CSV file below data-processed/ as one Arrow dataset.
# open_dataset() returns a dataset handle; the transformations that follow are
# expressed against it rather than against an in-memory data frame.
raw_forecasts <- arrow::open_dataset(
  sources = here::here("data-processed"),
  format = "csv",
  # Directory names are used verbatim (not "key=value" hive style) and are
  # exposed as a string column named `model`.
  hive_style = FALSE,
  partitioning = schema(model = string()),
  # Explicit Arrow types for the CSV columns instead of type inference.
  col_types = schema(
    forecast_date   = date32(),
    target          = string(),
    target_end_date = date32(),
    location        = string(),
    type            = string(),
    quantile        = float32(),
    value           = float32()
  )
)
# Reshape the raw forecasts: split `target` into its parts, restrict the date
# range, rename the value column, attach location metadata and align the
# forecast date to a weekly boundary.
raw_forecasts <- raw_forecasts |>
  dplyr::mutate(
    # Leading integer of `target` (presumably strings shaped like
    # "1 wk ahead inc case" -- verify against the data).
    horizon = as.integer(gsub("^(\\d+) .*", "\\1", target)),
    # Last two whitespace-separated words of `target`.
    target_variable = gsub(".* (\\w+ \\w+)$", "\\1", target),
    # Drop `target` itself once it has been used to build the columns above.
    .keep = "unused"
  ) |>
  # Keep forecasts dated 2021-03-08 or later.
  dplyr::filter(forecast_date >= ymd("2021-03-08")) |>
  dplyr::rename(prediction = value) |>
  # Join key is stated explicitly: `location` is the only column shared with
  # `locations`, and naming it avoids the implicit natural-join message and
  # guards against the key set changing if either table gains a column.
  dplyr::left_join(locations, by = "location") |>
  # Set forecast date to the corresponding submission date by rounding up to
  # the next week boundary.
  # NOTE(review): in lubridate, week_start = 3 denotes Wednesday -- confirm
  # this is the intended submission day.
  dplyr::mutate(
    forecast_date = ceiling_date(forecast_date, "week", week_start = 3)
  )
# Write the processed forecasts to a single Parquet file in the current
# working directory.
arrow::write_parquet(
  x = raw_forecasts,
  sink = "covid19-forecast-hub-europe.parquet"
)
# To load the file back into R:
# df <- arrow::read_parquet("covid19-forecast-hub-europe.parquet")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment