Skip to content

Instantly share code, notes, and snippets.

@kathsherratt
Last active July 1, 2023 16:22
Show Gist options
  • Save kathsherratt/a534bff397f0824403a2e81ba83ddbb9 to your computer and use it in GitHub Desktop.
Save kathsherratt/a534bff397f0824403a2e81ba83ddbb9 to your computer and use it in GitHub Desktop.
Get variants of concern dataset from ECDC
# Gets variants of concern dataset from ECDC
# For each variant/country finds the first period
# with variant % greater than 5% and up to 50%
# before the first peak for that variant
# Note ECDC data are in weeks, dates are indexed to Saturdays to match target end dates
#
# Example
# variant_names <- c("B.1.617.2" = "Delta", "B.1.1.529" = "Omicron")
# forecast_variants <- download_variant_introduction(introduction_percent = 5,
# country_names = "Germany",
# variant_codes = names(variant_names))
library(here)
library(dplyr)
library(readr)
library(lubridate)
download_variant_introduction <- function(country_names = NULL,
variant_codes = NULL,
introduction_percent = 5,
dominant_percent = 50,
peak_percent = 99) {
# date lookup table for ecdc year-weeks
date_range <- tibble(
daily_date = seq.Date(from = as.Date("2020-01-04"), to = Sys.Date(), by = 7),
year = isoyear(daily_date),
week = isoweek(daily_date)) %>%
group_by(year, week) %>%
filter(daily_date == min(daily_date))
# get data
variants <- read_csv("https://opendata.ecdc.europa.eu/covid19/virusvariant/csv/data.csv", progress = FALSE, show_col_types = FALSE)
if (is.null(variant_codes)) {variant_codes <- unique(variants$variant)}
if (is.null(country_names)) {country_names <- unique(variants$country)}
# filter as needed and set dates
variants <- variants %>%
filter(variant %in% variant_codes &
country %in% country_names &
source == "GISAID") %>%
mutate(year = as.numeric(substr(year_week, 1, 4)),
week = as.numeric(substr(year_week, 6, 8))) %>%
left_join(date_range, by = c("year", "week")) %>%
select(country, daily_date, variant, percent_variant)
# get the first maxima of the variant
introduction_period <- variants %>%
group_by(variant, country) %>%
arrange(daily_date) %>%
filter(percent_variant <= peak_percent) %>%
slice_max(percent_variant, with_ties = FALSE) %>%
select(variant, country, variant_peak_date = daily_date)
# get the period before the first maxima where the variant is e.g. >5<50%
var_data <- variants %>%
left_join(introduction_period, by = c("variant", "country")) %>%
filter(daily_date <= variant_peak_date &
percent_variant >= introduction_percent &
percent_variant <= dominant_percent) %>%
group_by(country, variant) %>%
summarise(date_introduction = min(daily_date),
date_dominant = max(daily_date)+7,
date_peak = min(variant_peak_date))
return(var_data)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment