Created
February 2, 2018 13:27
-
-
Save tmasjc/e1c556e55f6f76dc62c460114b5fc140 to your computer and use it in GitHub Desktop.
Explore chinese outbound tourism data from UNWTO. #rstats #unwto
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# China Outbound Tourism | |
library(readxl) | |
library(tidyr) | |
library(ggplot2) | |
library(broom) | |
# read data | |
cot <- read_excel("~/Downloads/outbound_tourism_china.xlsx", skip = 5) | |
# mod columns name | |
colnames(cot) <- c("country", "series", 1995:2016, "pct_chg_16_15") | |
# set ggplot2 palette | |
old <- theme_set(theme_minimal()) | |
pal = scale_color_brewer(palette = "Set3") | |
# Which series has the most complete data? (TFR) | |
cot %>% group_by(series) %>% summarise(n = n()) %>% arrange(-n) | |
# Convert to tidy format | |
long <- cot %>% | |
filter(series == "TFR") %>% | |
select(-pct_chg_16_15, -series) %>% | |
gather(year, arrivals, -country) | |
long | |
## TODO: How To Select A Series? | |
# Which country left? | |
long %>% | |
group_by(country) %>% | |
summarise(sum = sum(arrivals, na.rm = TRUE)) %>% | |
arrange(-sum) | |
# Which countries are the most popular? --------------------------------------- | |
# Top 10 country in 2016 | |
top10_2016 <- long %>% filter(year == 2016) %>% top_n(wt = arrivals, n = 10) %>% pull(country) | |
top10_2016 | |
df <- long %>% filter(country %in% top10_2016) | |
df | |
df %>% | |
ggplot(aes(year, arrivals, col = country, group = country)) + | |
geom_line() + | |
theme(legend.position = "bottom") + | |
pal | |
df %>% | |
ggplot(aes(year, arrivals, col = country, group = country)) + | |
geom_line() + | |
scale_y_log10() + | |
theme(legend.position = "bottom") + | |
pal | |
df %>% | |
ggplot(aes(year, arrivals, col = country, group = country)) + | |
geom_smooth(se = FALSE) + | |
scale_y_log10() + | |
theme(legend.position = "bottom") + | |
pal | |
# Which countries grow the fastest? ----------------------------------------- | |
long <- long %>% | |
group_by(country) %>% | |
arrange(year) %>% | |
mutate(growth = (arrivals - lag(arrivals)) / lag(arrivals)) | |
long %>% arrange(country, year) | |
top20_growth <- long %>% | |
summarise(growth = mean(growth, na.rm = TRUE)) %>% | |
top_n(n = 20, wt = growth) %>% | |
pull(country) | |
top20_growth | |
long %>% | |
summarise(growth = mean(growth, na.rm = TRUE)) %>% | |
top_n(n = 20, wt = growth) %>% | |
ggplot(aes(country, growth)) + | |
geom_point(aes(col = country)) + | |
scale_color_discrete(guide = "none") + | |
theme(axis.text.x = element_text(angle = 60)) | |
long %>% | |
filter(country %in% top20_growth) %>% | |
ggplot(aes(year, arrivals, col = country, group = country)) + | |
geom_line() + | |
scale_y_log10() + | |
theme(legend.position = "bottom") | |
long %>% | |
filter(country %in% top20_growth) %>% | |
ggplot(aes(year, arrivals, col = country, group = country)) + | |
geom_smooth(se = FALSE) + | |
scale_y_log10() + | |
theme(legend.position = "bottom") | |
# Measure goodness of fit ------------------------------------------------- | |
long | |
mods <- long %>% | |
group_by(country) %>% | |
nest() %>% | |
# do lm() on every row | |
mutate(model = purrr::map(data, ~lm(log(arrivals) ~ as.integer(year), data = .))) | |
mods | |
# use broom here | |
mods_sum <- mods %>% unnest(model %>% purrr::map(glance)) | |
mods_sum | |
# compare r2 | |
p <- mods_sum %>% | |
ggplot(aes(r.squared, reorder(country, r.squared))) + | |
geom_point() | |
p | |
# use USA as a benchmark | |
usa.row <- mods_sum %>% | |
arrange(-r.squared) %>% | |
with(., which(country == "United States of America")) | |
p + geom_hline(yintercept = usa.row-1, col = "salmon", lty = 5) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@ashleylijie