Created
March 2, 2023 11:29
-
-
Save hannesdatta/56f58334da6d1e23d3bf1683a3c0a752 to your computer and use it in GitHub Desktop.
A data cleaning script for demonstration of a setup-input-transformation-output building block
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Data cleaning script organised as setup -> input -> transformation -> output.
# It downloads a zipped raw data set, loads three tables, merges them into one
# data frame and writes the merged result to 'streams_merged.csv'.

# Setup/initialization
library(tidyverse)

## Wipe any previously downloaded files so each run starts from a clean slate.
## NOTE: the wildcards match every .zip and .csv file in the working directory,
## including a 'streams_merged.csv' left over from an earlier run.
unlink('*.zip')
unlink('*.csv')

## Download raw data
## mode = 'wb' requests a binary transfer so the archive is not altered by
## text-mode line-ending conversion on Windows.
download.file('https://github.com/hannesdatta/course-dprep/raw/master/content/docs/tutorials/data-preparation/data_without_duplicates.zip', 'data.zip', mode = 'wb')

## Unzip raw data into the working directory
## (presumably this yields the three CSV files read below -- confirm against
## the archive contents).
unzip('data.zip')

# Input
## Load data sets into memory
streams <- read_csv('streams.csv')
songs <- read_csv('songs.csv')
## country_codes.csv is semicolon-delimited, hence read_delim() with delim = ';'
country_codes <- read_delim('country_codes.csv', delim = ';')

# Transformation
## Merge song information (key: song_id) and then country information
## (streams' 'country' matched against 'country_code_2_letter') onto the streams.
## Both joins are part of one assigned pipeline: previously the second join was
## computed but its result was discarded, so the country columns never reached
## the output file.
streams_joined <- streams %>%
  left_join(songs, by = c('song_id')) %>%
  left_join(country_codes, by = c('country' = 'country_code_2_letter'))

# Output
## Write the fully merged data set to disk
write_csv(streams_joined, 'streams_merged.csv')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment