Skip to content

Instantly share code, notes, and snippets.

@nacnudus
Created September 15, 2018 10:02
Show Gist options
  • Save nacnudus/77c1e0660176f6822bfe20c8b5313de8 to your computer and use it in GitHub Desktop.
Save nacnudus/77c1e0660176f6822bfe20c8b5313de8 to your computer and use it in GitHub Desktop.
Tidy a spreadsheet of the Luxembourg Time Use Survey with unpivotr
# Inspired by http://www.brodrigues.co/blog/2018-09-11-human_to_machine/
# https://twitter.com/brodriguesco/status/1039604517287931904
# "You can find the data I will use here. Click on the “Time use” folder and you can download the workbook."
# http://statistiques.public.lu/stat/ReportFolders/ReportFolder.aspx?IF_Language=eng&MainTheme=3&FldrName=1&RFPath=14306
library(tidyverse)
library(tidyxl)
library(unpivotr)
library(lubridate)
path <- "./download.xlsx"
formats <- xlsx_formats(path)
cells <-
path %>%
xlsx_cells() %>%
# Drop French and Index sheets
dplyr::filter(str_detect(sheet, "day$")) %>%
# Clean character values
mutate(character = str_trim(character)) %>%
# Drop empty cells
dplyr::filter(data_type != "blank",
!(data_type == "character" && character == "")) %>%
# Drop total rows
dplyr::filter(row <= 58L) %>%
# Separate out the bold categories in the first two columns by budging them to
# the left, then budging everything to the right so that all column numbers
# are positive.
mutate(col = if_else(col %in% 1:2 & formats$local$font$bold[local_format_id],
col - 2L,
col),
col = col + 2L) %>%
# Fix time values expressed as dates rather than character
mutate(character = if_else(data_type == "date", "00:00", character),
data_type = if_else(data_type == "date", "character", data_type))
# Tidy every sheet
tidy_sheet <- function(cells) {
series <- dplyr::filter(cells, row == 1L, col == 1L)$character
cells %>%
dplyr::filter(row >= 2L) %>%
behead("WNW", "activity_category_id") %>%
behead("WNW", "activity_category") %>%
behead("W", "activity_subcategory_id") %>%
behead("W", "activity_subcategory") %>%
behead("NNW", "grouping") %>%
behead("NNW", "group") %>%
behead("NNW", "metric") %>%
behead("N", "unit") %>%
select(-row, -col) %>%
spatter(unit) %>% # like tidyr::spread() by handles mixed data types
mutate(Time = as.integer(as.duration(lubridate::hm(Time))))
}
tidy_data <-
cells %>%
select(sheet, row, col, data_type, character, numeric) %>%
nest(-sheet) %>%
mutate(data = map(data, tidy_sheet)) %>%
unnest()
tidy_data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment