Created
February 16, 2023 11:37
-
-
Save hannesdatta/1524d21d14cecf46d0eab79a03aeb448 to your computer and use it in GitHub Desktop.
dprep-exercises-2023-02-16
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
title: "dPrep Tutorial"
output: html_document
date: "2023-02-16"
---
```{r setup, include=FALSE} | |
# Show the source code of every chunk by default (a chunk can still override
# this in its own header).
knitr::opts_chunk$set(echo = TRUE)
``` | |
## Exercises | |
### Exercise 1: Conditionals | |
```{r, echo=FALSE} | |
# Exercise 1: subsetting a numeric vector that contains a missing value.
# Every bare expression below is printed when the chunk runs, so the
# intermediate steps show how each solution is built up gradually.

# Initialization of the data
x <- c(10, 20, NA, 5, 3, 100)

# 1) all values equal to 10
# logical vector marking which elements are 10 (the NA element stays NA)
x == 10
# wrapping it into which() --> to get index NUMBERS (the NA is dropped)
which(x == 10)
# using it on x as an index vector
x[which(x == 10)]

# 2) all values not equal to 10
x != 10
which(x != 10)
x[which(x != 10)]
# indexing with the logical vector keeps the NA comparison as an NA element ...
x[x != 10]
# ... whereas indexing via which() leaves it out
x[which(x != 10)]

# 3) values larger than 20
x[which(x > 20)]

# 4) values smaller than 10
x[which(x < 10)]

# 5) counting number of missing values
# (remember to build code gradually)
is.na(x)
data.frame(x, is.na(x))
# one solution: summing up (TRUE counts as 1)
sum(is.na(x))
# second solution
which(is.na(x)) # which items are missing? # the third!
length(which(is.na(x)))

# 6) all values larger than 20 OR missing values
larger_than_20 <- x > 20
is_na <- is.na(x)
# NA | TRUE is TRUE, so the missing element is selected by the combined mask
combined <- larger_than_20 | is_na
data.frame(x, larger_than_20, is_na, combined)
x[combined]

# 7) values larger than 5 AND smaller than 20 (the & operator)
first <- x > 5
second <- x < 20
combination <- first & second
data.frame(x, first, second, combination)
x[combination] # w/ NAs
x[which(combination)] # without NAs
``` | |
### Exercise 2: Controls
```{r} | |
# Fetch the streams data and load all of it.
download.file("https://raw.githubusercontent.com/hannesdatta/course-dprep/master/content/docs/modules/week4/regional-global-daily-latest.csv", "streams.csv")
library(tidyverse)
# skip = 1 skips the first line of the file; n_max = Inf puts no cap on the
# number of rows read
streams <- read_csv("streams.csv", skip = 1, n_max = Inf)
``` | |
```{r warning=FALSE} | |
# Load the streams data with a prototyping switch that caps the row count.

# Only fetch the file if it is not on disk yet, so the same file is not
# downloaded again on every run.
if (!file.exists("streams.csv")) {
  download.file("https://raw.githubusercontent.com/hannesdatta/course-dprep/master/content/docs/modules/week4/regional-global-daily-latest.csv", "streams.csv")
}
library(tidyverse)

# Set prototype to TRUE to read only the first 100 rows while developing.
prototype <- FALSE
nrows <- Inf
if (isTRUE(prototype)) {
  nrows <- 100
}
streams <- read_csv("streams.csv", skip = 1, n_max = nrows)
nrow(streams)
``` | |
```{r} | |
# Another (less optimal) solution: the whole read_csv() call is written out in
# both branches instead of only switching the n_max value.
prototype <- TRUE
if (isTRUE(prototype)) {
  streams <- read_csv("streams.csv", skip = 1, n_max = 100)
} else {
  streams <- read_csv("streams.csv", skip = 1, n_max = Inf)
}
nrow(streams)
``` | |
```{r} | |
# Download the listed Inside Airbnb snapshots, saving each one under a file
# name derived from its URL.
urls <- c(
  "http://data.insideairbnb.com/spain/catalonia/barcelona/2022-12-11/visualisations/listings.csv",
  "http://data.insideairbnb.com/spain/catalonia/barcelona/2022-09-10/visualisations/listings.csv",
  "http://data.insideairbnb.com/spain/catalonia/barcelona/2022-06-07/visualisations/listings.csv"
)
for (url in urls) {
  # Keep letters AND digits: the snapshot date is the only part that differs
  # between these URLs, so dropping the digits would give every file the same
  # name and each download would overwrite the previous one.
  filename <- gsub("[^a-zA-Z0-9]", "", url)
  # wipe httpdatainsideairbnbcom from filename
  filename <- gsub("httpdatainsideairbnbcom", "", filename)
  # paste0() appends the extension without the space that paste() inserts
  filename <- paste0(filename, ".csv")
  download.file(url, destfile = filename) # download file
}
``` | |
### Use an `lapply()` loop to load the data sets
```{r} | |
# Read every snapshot straight from its URL and stack them into one table.
urls <- c(
  "http://data.insideairbnb.com/spain/catalonia/barcelona/2022-12-11/visualisations/listings.csv",
  "http://data.insideairbnb.com/spain/catalonia/barcelona/2022-09-10/visualisations/listings.csv",
  "http://data.insideairbnb.com/spain/catalonia/barcelona/2022-06-07/visualisations/listings.csv"
)
# lapply() calls read_csv() once per URL and collects the results in a list
datasets <- lapply(urls, read_csv)
# stack the list elements row-wise into a single data set
final_data <- bind_rows(datasets)
``` | |
#### A very, very bad solution: one copy-pasted `read_csv()` call per file
```{r} | |
# Anti-pattern: a separate, copy-pasted read_csv() call and variable per file.
datasets1 <- read_csv("http://data.insideairbnb.com/spain/catalonia/barcelona/2022-12-11/visualisations/listings.csv")
datasets2 <- read_csv("http://data.insideairbnb.com/spain/catalonia/barcelona/2022-09-10/visualisations/listings.csv")
nrow(datasets1)
nrow(datasets2)
``` | |
## Exploration | |
```{r} | |
# Number of rows in the combined data set
final_data %>% count()
``` | |
``` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment