tmasjc · February 2, 2018 13:27 · tmasjc · Feb 2, 2018
diff --git a/unwto.R b/unwto.R
 # China Outbound Tourism
 library(readxl)
 library(tidyr)
 library(ggplot2)
 library(broom)

 # read data
 cot <- read_excel("~/Downloads/outbound_tourism_china.xlsx", skip = 5)
 # mod columns name
 colnames(cot) <- c("country", "series", 1995:2016, "pct_chg_16_15")

 # set ggplot2 palette
 old <- theme_set(theme_minimal())
 pal = scale_color_brewer(palette = "Set3")

 # Which series has the most complete data? (TFR)
 cot %>% group_by(series) %>% summarise(n = n()) %>% arrange(-n)


 # Convert to tidy format
 long <- cot %>% 
  filter(series == "TFR") %>% 
  select(-pct_chg_16_15, -series) %>% 
  gather(year, arrivals, -country)
 long
 ## TODO: How To Select A Series?

 # Which country left?
 long %>% 
  group_by(country) %>% 
  summarise(sum = sum(arrivals, na.rm = TRUE)) %>% 
  arrange(-sum)


 # Which countries are the most popular? ---------------------------------------


 # Top 10 country in 2016
 top10_2016 <- long %>% filter(year == 2016) %>% top_n(wt = arrivals, n = 10) %>% pull(country)
 top10_2016

 df <- long %>% filter(country %in% top10_2016) 
 df

 df %>% 
  ggplot(aes(year, arrivals, col = country, group = country)) +
  geom_line() + 
  theme(legend.position = "bottom") +
  pal

 df %>% 
  ggplot(aes(year, arrivals, col = country, group = country)) +
  geom_line() + 
  scale_y_log10() +
  theme(legend.position = "bottom") +
  pal

 df %>% 
  ggplot(aes(year, arrivals, col = country, group = country)) +
  geom_smooth(se = FALSE) +
  scale_y_log10() +
  theme(legend.position = "bottom") + 
  pal



 # Which countries grow the fastest? -----------------------------------------


 long <- long %>% 
  group_by(country) %>% 
  arrange(year) %>% 
  mutate(growth = (arrivals - lag(arrivals)) / lag(arrivals))
 long %>% arrange(country, year)

 top20_growth <- long %>% 
  summarise(growth = mean(growth, na.rm = TRUE)) %>% 
  top_n(n = 20, wt = growth) %>% 
  pull(country)
 top20_growth

 long %>% 
  summarise(growth = mean(growth, na.rm = TRUE)) %>% 
  top_n(n = 20, wt = growth) %>% 
  ggplot(aes(country, growth)) +
  geom_point(aes(col = country)) +
  scale_color_discrete(guide = "none") +
  theme(axis.text.x = element_text(angle = 60))

 long %>% 
  filter(country %in% top20_growth) %>% 
  ggplot(aes(year, arrivals, col = country, group = country)) +
  geom_line() + 
  scale_y_log10() +
  theme(legend.position = "bottom")

 long %>% 
  filter(country %in% top20_growth) %>% 
  ggplot(aes(year, arrivals, col = country, group = country)) +
  geom_smooth(se = FALSE) +
  scale_y_log10() +
  theme(legend.position = "bottom")



 # Measure goodness of fit -------------------------------------------------

 long

 mods <- long %>% 
  group_by(country) %>% 
  nest() %>% 
  # do lm() on every row 
  mutate(model = purrr::map(data, ~lm(log(arrivals) ~ as.integer(year), data = .)))
 mods

 # use broom here
 mods_sum <- mods %>% unnest(model %>% purrr::map(glance))
 mods_sum

 # compare r2
 p <- mods_sum %>% 
  ggplot(aes(r.squared, reorder(country, r.squared))) + 
  geom_point()  
 p
  

 # use USA as a benchmark
 usa.row <- mods_sum %>% 
  arrange(-r.squared) %>% 
  with(., which(country == "United States of America"))


 p + geom_hline(yintercept = usa.row-1, col = "salmon", lty = 5)
	# China Outbound Tourism
	library(readxl)
	library(tidyr)
	library(ggplot2)
	library(broom)

	# read data
	cot <- read_excel("~/Downloads/outbound_tourism_china.xlsx", skip = 5)
	# mod columns name
	colnames(cot) <- c("country", "series", 1995:2016, "pct_chg_16_15")

	# set ggplot2 palette
	old <- theme_set(theme_minimal())
	pal = scale_color_brewer(palette = "Set3")

	# Which series has the most complete data? (TFR)
	cot %>% group_by(series) %>% summarise(n = n()) %>% arrange(-n)


	# Convert to tidy format
	long <- cot %>%
	filter(series == "TFR") %>%
	select(-pct_chg_16_15, -series) %>%
	gather(year, arrivals, -country)
	long
	## TODO: How To Select A Series?

	# Which country left?
	long %>%
	group_by(country) %>%
	summarise(sum = sum(arrivals, na.rm = TRUE)) %>%
	arrange(-sum)


	# Which countries are the most popular? ---------------------------------------


	# Top 10 country in 2016
	top10_2016 <- long %>% filter(year == 2016) %>% top_n(wt = arrivals, n = 10) %>% pull(country)
	top10_2016

	df <- long %>% filter(country %in% top10_2016)
	df

	df %>%
	ggplot(aes(year, arrivals, col = country, group = country)) +
	geom_line() +
	theme(legend.position = "bottom") +
	pal

	df %>%
	ggplot(aes(year, arrivals, col = country, group = country)) +
	geom_line() +
	scale_y_log10() +
	theme(legend.position = "bottom") +
	pal

	df %>%
	ggplot(aes(year, arrivals, col = country, group = country)) +
	geom_smooth(se = FALSE) +
	scale_y_log10() +
	theme(legend.position = "bottom") +
	pal



	# Which countries grow the fastest? -----------------------------------------


	long <- long %>%
	group_by(country) %>%
	arrange(year) %>%
	mutate(growth = (arrivals - lag(arrivals)) / lag(arrivals))
	long %>% arrange(country, year)

	top20_growth <- long %>%
	summarise(growth = mean(growth, na.rm = TRUE)) %>%
	top_n(n = 20, wt = growth) %>%
	pull(country)
	top20_growth

	long %>%
	summarise(growth = mean(growth, na.rm = TRUE)) %>%
	top_n(n = 20, wt = growth) %>%
	ggplot(aes(country, growth)) +
	geom_point(aes(col = country)) +
	scale_color_discrete(guide = "none") +
	theme(axis.text.x = element_text(angle = 60))

	long %>%
	filter(country %in% top20_growth) %>%
	ggplot(aes(year, arrivals, col = country, group = country)) +
	geom_line() +
	scale_y_log10() +
	theme(legend.position = "bottom")

	long %>%
	filter(country %in% top20_growth) %>%
	ggplot(aes(year, arrivals, col = country, group = country)) +
	geom_smooth(se = FALSE) +
	scale_y_log10() +
	theme(legend.position = "bottom")



	# Measure goodness of fit -------------------------------------------------

	long

	mods <- long %>%
	group_by(country) %>%
	nest() %>%
	# do lm() on every row
	mutate(model = purrr::map(data, ~lm(log(arrivals) ~ as.integer(year), data = .)))
	mods

	# use broom here
	mods_sum <- mods %>% unnest(model %>% purrr::map(glance))
	mods_sum

	# compare r2
	p <- mods_sum %>%
	ggplot(aes(r.squared, reorder(country, r.squared))) +
	geom_point()
	p


	# use USA as a benchmark
	usa.row <- mods_sum %>%
	arrange(-r.squared) %>%
	with(., which(country == "United States of America"))


	p + geom_hline(yintercept = usa.row-1, col = "salmon", lty = 5)