benmarwick · April 25, 2017 01:10
diff --git a/gistfile1.txt b/gistfile1.txt
 # Get the list of wikileaks files and directories: read the file index
 # page and take the link text of every anchor on it.
 library(rvest)
 wl <- read_html("https://file.wikileaks.org/file/")

 # Clean the names a little so that words can be matched individually.
 wllt <-
   wl %>%
   # get text of the links
   html_nodes("a") %>%
   html_text() %>%
   # replace _, - and . with spaces; inside the character class the dot is
   # literal (the old pattern "//." looked for two slashes followed by any
   # character, so dots were never replaced)
   gsub("[_.-]", " ", .) %>%
   # all lower case
   tolower()

 # Search terms: the lower-cased English country names shipped with the
 # countrycode package, plus "us" and "uk" in the way they are used in
 # wikileaks file/folder names.
 library(countrycode)
 countries <- c(
   tolower(countrycode_data$country.name.en),
   "us",
   "uk"
 )

 # compute frequency of countries in file/directory names

 # Pre-allocated logical matrix of matches: one row per country, one
 # column per wikileaks file/folder name.
 output <- matrix(FALSE,
                  nrow = length(countries),
                  ncol = length(wllt))

 # For each country, test all file/folder names with a single vectorised
 # grepl() call. The name is wrapped in \\b word boundaries so it only
 # matches whole words: plain substring matching also counted "us" inside
 # "russia" or "business" and "uk" inside "ukraine".
 for (i in seq_along(countries)) {
   output[i, ] <- grepl(paste0("\\b", countries[i], "\\b"), wllt)
 }

 # make the output a bit easier to read
 output_df <- as.data.frame(output)
 names(output_df) <- wllt
 row.names(output_df) <- countries

 # tally up the number of times each country is mentioned
 library(dplyr)

 # The row sums of the logical match table are the number of file/folder
 # names that mention each country. The table is assembled directly
 # instead of going through as_data_frame() on a vector (deprecated in
 # tibble) followed by a rename of its "value" column.
 country_freqs <-
   data.frame(Freq = unname(rowSums(output_df)),
              country = row.names(output_df),
              stringsAsFactors = FALSE) %>%
   arrange(desc(Freq)) %>%
   # only countries mentioned at least twice
   filter(Freq >= 2)

 # plot
 library(ggplot2)
 # base font size passed to the plot themes below
 base_size <- 12

 # Horizontal bar chart of the mention counts, with the countries ordered
 # by frequency.
 wl_freq_plot <-
   ggplot(country_freqs,
          aes(reorder(country,
                      Freq),
              Freq)) +
   geom_col() +
   coord_flip() +
   theme_bw(base_size = base_size) +
   xlab("Countries mentioned\nat least twice") +
   ylab("Number of Wikileaks file/folder-names \ncontaining country name") +
   ggtitle("The US, UK, and Iraq are most frequently\nmentioned in Wikileaks")


 #------------------------------------------------------------------------------

 # Get World Justice Project (WJP) Open Government Index data
 library(readxl)
 library(httr)

 the_url <- "http://www.worldjusticeproject.org/sites/default/files/wjp-open-gov-2015_data.xlsx"
 # Download the workbook to a temporary file. stop_for_status() turns a
 # failed request into an R error here, rather than letting read_excel()
 # fail later on whatever the server sent back.
 the_excel_file <- tempfile(fileext = ".xlsx")
 stop_for_status(GET(the_url, write_disk(the_excel_file)))
 og <- read_excel(the_excel_file)
 # make the column headers syntactically valid, unique R names
 names(og) <- make.names(names(og), unique = TRUE)

 # Keep the country and score columns, with lower-case country names and
 # the US/UK recoded to the abbreviations used for the wikileaks data.
 og_clean <-
   og %>%
   # NOTE(review): assumes the country column arrives as "NA." after
   # make.names() and that "Scores" is the overall index -- confirm
   # against the downloaded sheet
   rename(countries = NA.,
          openness = Scores) %>%
   select(countries, openness) %>%
   mutate(countries = tolower(countries),
          openness = as.numeric(openness)) %>%
   mutate(countries = if_else(countries == "united states", "us",
                              if_else(countries == "united kingdom", "uk",
                                      countries)))

 # Attach the open government score to each country's wikileaks count and
 # drop every row that has a missing value after the join.
 wl_and_wjp <- na.omit(
   left_join(country_freqs,
             og_clean,
             by = c("country" = "countries"))
 )

 # Scatter plot of wikileaks counts against the WJP open governance
 # index; countries with an index below 0.5 are drawn again in red.
 library(ggrepel)
 library(ggalt)

 wl_and_wjp_all_plot <-
   ggplot(data = wl_and_wjp,
          mapping = aes(x = Freq, y = openness)) +
   geom_point(size = 3) +
   geom_point(data = subset(wl_and_wjp, openness < 0.5),
              colour = "red",
              size = 3) +
   geom_text_repel(mapping = aes(label = country)) +
   theme_bw(base_size = base_size) +
   xlab("Wikileaks items") +
   ylab("World Justice Project\nOpen Government Index")

 # US is right out there! And UK is out there a bit too.
 # Let's remove them and see what we have...
 # The filtered data is stored once and used for every layer below, so the
 # red highlight layer can no longer draw points for the excluded
 # countries (it used to be built from the unfiltered wl_and_wjp).
 wl_and_wjp_others <-
   wl_and_wjp %>%
   filter(!country %in% c("uk", "us"))

 # print() so the plot is also drawn when the script is run via source()
 print(
   ggplot(wl_and_wjp_others,
          aes(Freq,
              openness)) +
     geom_point(size = 3) +
     geom_point(size = 3,
                data = subset(wl_and_wjp_others,
                              openness < 0.5),
                colour = "red") +
     geom_text_repel(aes(label = country)) +
     theme_bw(base_size = base_size) +
     labs(x = "Wikileaks items",
          y = "World Justice Project\nOpen Government Index")
 )

 # is there a linear relationship between the OGI and the WL count?
 library(ggpmisc)
 formula <- y ~ x

 wl_and_wjp_regression <-
   ggplot(wl_and_wjp_others,
          aes(Freq,
              openness)) +
   # confidence band of the linear fit
   geom_ribbon(stat = 'smooth',
               method = "lm",
               formula = formula,
               se = TRUE,
               alpha = 0.05,
               aes(color = NULL)) +
   # fitted line, using the same formula as the band and the equation label
   geom_line(stat = 'smooth',
             method = "lm",
             formula = formula,
             alpha = 0.3,
             size = 1) +
   # equation and adjusted R^2 of the fit, drawn on the plot
   stat_poly_eq(aes(label = paste(..eq.label..,
                                  ..adj.rr.label..,
                                  sep = "~~~~")),
                formula = formula,
                rr.digits = 3,
                coef.digits = 2,
                parse = TRUE, hjust = -0.25) +
   geom_point(size = 3) +
   # countries with an index below 0.5 are drawn again in red
   geom_point(size = 3,
              data = subset(wl_and_wjp_others,
                            openness < 0.5),
              colour = "red") +
   geom_text_repel(aes(label = country)) +
   theme_bw(base_size = base_size) +
   labs(x = "Wikileaks items (excluding US & UK)",
        y = "World Justice Project\nOpen Government Index") +
   ggtitle("A very weak relationship between the number of\nWikileaks items and the Open Government Index")

 # not really... what about a difference between countries with OGI < 0.5 and countries with OGI > 0.5?

 library(broom)

 # Per OGI group ("high" when openness > 0.5, otherwise "low"): the number
 # of countries and their summed wikileaks frequency.
 ogi_hi_lo <-
   summarise(
     group_by(
       mutate(wl_and_wjp,
              ogi = if_else(openness > 0.5, "high", "low")),
       ogi
     ),
     count_countries = n(),
     total_frequency = sum(Freq)
   )

 # Chi-squared test on the two numeric columns, tidied with broom.
 ogi_hi_lo_test <- tidy(chisq.test(select(ogi_hi_lo, -ogi)))

 # Test result as text, for use in the plot title.
 chi_sq_output <- paste0(
   "chi-square = ", round(ogi_hi_lo_test$statistic, 3),
   ", ",
   "p-value = ", round(ogi_hi_lo_test$p.value, 3)
 )

 library(ggmosaic)
 library(viridis)

 # Mosaic plot of the two OGI groups, weighted by total frequency.
 wl_ogi_rank_plot <-
   ggplot(data = ogi_hi_lo) +
   geom_mosaic(mapping = aes(weight = total_frequency,
                             x = product(count_countries),
                             fill = ogi)) +
   scale_fill_viridis(name = "Open\nGovernment\nIndex rank",
                      discrete = TRUE) +
   xlab("Number of countries in Wikileaks files/folder-names") +
   theme_bw(base_size = base_size) +
   ggtitle(paste0("Significantly more countries with high\nOpen Governance Index values in Wikileaks \n(",
                  chi_sq_output,
                  ")"))

 # there is a significant difference, WL has significantly MORE documents
 # for countries with HIGH OGI values. Not what we'd expect if WL is
 # focused on opening government in the most needful cases.


 # here are the main visualisations for the WL and OGI data
 # print() is called explicitly so the plots are also drawn when the
 # script is run with source(), where top-level values are not
 # auto-printed by default.
 print(wl_freq_plot)
 print(wl_and_wjp_regression)
 print(wl_ogi_rank_plot)


 # Get Transparency International's (TI) Corruption Perceptions Index (CPI) data

 library(readxl)
 library(httr)

 the_url <- "http://files.transparency.org/content/download/2060/13252/file/CPI2016_FullDataSetWithRegionalTables.xlsx"
 # Download the workbook to a temporary file. stop_for_status() turns a
 # failed request into an R error here, rather than letting read_excel()
 # fail later on whatever the server sent back.
 the_excel_file <- tempfile(fileext = ".xlsx")
 stop_for_status(GET(the_url, write_disk(the_excel_file)))
 ti <- read_excel(the_excel_file)
 # make the column headers syntactically valid, unique R names
 names(ti) <- make.names(names(ti), unique = TRUE)

 # Keep country and 2016 score, with lower-case country names and the
 # US/UK recoded to the abbreviations used for the wikileaks data.
 ti_clean <-
   ti %>%
   select(Country, CPI2016) %>%
   mutate(Country = tolower(Country)) %>%
   # NOTE(review): "united states" is accepted as a second spelling in
   # case the sheet does not use the long form -- confirm which one the
   # data contains
   mutate(Country = if_else(Country %in% c("the united states of america",
                                           "united states"),
                            "us",
                            if_else(Country == "united kingdom", "uk",
                                    Country)))

 # Attach the CPI score to each country's wikileaks count and drop every
 # row that has a missing value after the join.
 # (Despite its name, ti_and_wjp holds wikileaks counts joined with TI data.)
 ti_and_wjp <- na.omit(
   left_join(country_freqs,
             ti_clean,
             by = c("country" = "Country"))
 )

 # Scatter plot of wikileaks counts against the TI CPI; countries with a
 # CPI below 50 are drawn again in red.
 library(ggrepel)
 library(ggalt)

 ti_and_wjp_all_plot <-
   ggplot(data = ti_and_wjp,
          mapping = aes(x = Freq, y = CPI2016)) +
   geom_point(size = 3) +
   geom_point(data = subset(ti_and_wjp, CPI2016 < 50),
              colour = "red",
              size = 3) +
   geom_text_repel(mapping = aes(label = country)) +
   theme_bw(base_size = base_size) +
   xlab("Wikileaks items") +
   ylab("Transparency International\nCorruption Perceptions Index")

 # Is there a linear relationship between the CPI and the WL count?
 library(ggpmisc)
 formula <- y ~ x

 # Leave out the US, UK and Iraq before fitting.
 ti_and_wjp_others <- filter(ti_and_wjp,
                             !country %in% c("uk", "us", "iraq"))

 ti_and_wjp_regression <-
   ggplot(data = ti_and_wjp_others,
          mapping = aes(x = Freq, y = CPI2016)) +
   # confidence band of the linear fit
   geom_ribbon(mapping = aes(color = NULL),
               stat = 'smooth',
               method = "lm",
               formula = formula,
               se = TRUE,
               alpha = 0.05) +
   # fitted line
   geom_line(stat = 'smooth',
             method = "lm",
             size = 1,
             alpha = 0.3) +
   # equation and adjusted R^2 of the fit, drawn on the plot
   stat_poly_eq(mapping = aes(label = paste(..eq.label..,
                                            ..adj.rr.label..,
                                            sep = "~~~~")),
                formula = formula,
                rr.digits = 3,
                coef.digits = 2,
                parse = TRUE,
                hjust = -0.25) +
   geom_point(size = 3) +
   # countries with a CPI below 50 are drawn again in red
   geom_point(data = subset(ti_and_wjp_others, CPI2016 < 50),
              colour = "red",
              size = 3) +
   geom_text_repel(mapping = aes(label = country)) +
   theme_bw(base_size = base_size) +
   xlab("Wikileaks items (excluding US, UK & Iraq)") +
   ylab("Transparency International\nCorruption Perceptions Index") +
   ggtitle("A very weak relationship between the number of\nWikileaks items and the Corruption Perceptions Index")

 # not really... what about a difference between countries with CPI < 50 and countries with CPI > 50?

 library(broom)

 # Per CPI group ("high" when CPI2016 > 50, otherwise "low"): the number
 # of countries and their summed wikileaks frequency.
 cpi_hi_lo <-
   summarise(
     group_by(
       mutate(ti_and_wjp,
              cpi = if_else(CPI2016 > 50, "high", "low")),
       cpi
     ),
     count_countries = n(),
     total_frequency = sum(Freq)
   )

 # Chi-squared test on the two numeric columns, tidied with broom.
 cpi_hi_lo_test <- tidy(chisq.test(select(cpi_hi_lo, -cpi)))

 # Test result as text, for use in the plot title.
 chi_sq_output <- paste0(
   "chi-square = ", round(cpi_hi_lo_test$statistic, 3),
   ", ",
   "p-value = ", round(cpi_hi_lo_test$p.value, 10)
 )

 library(ggmosaic)
 library(viridis)

 # Mosaic plot of the two CPI groups, weighted by total frequency.
 wl_cpi_rank_plot <-
   ggplot(data = cpi_hi_lo) +
   geom_mosaic(mapping = aes(weight = total_frequency,
                             x = product(count_countries),
                             fill = cpi)) +
   scale_fill_viridis(name = "Corruption\nPerceptions\nIndex rank",
                      discrete = TRUE) +
   xlab("Number of countries in Wikileaks files/folder-names") +
   theme_bw(base_size = base_size) +
   ggtitle(paste0("Significantly more countries with\nlow corruption values in Wikileaks \n(",
                  chi_sq_output,
                  ")"))





  
 # ---------------------------------------------------------------------------

 # Get the fractionalization data set and keep its "Ethnic" column
 library(readxl)
 library(httr)

 the_url <- "http://www.anderson.ucla.edu/faculty_pages/romain.wacziarg/downloads/fractionalization.xls"
 # Download the workbook to a temporary file. stop_for_status() turns a
 # failed request into an R error here, rather than letting read_excel()
 # fail later on whatever the server sent back.
 the_excel_file <- tempfile(fileext = ".xls")
 stop_for_status(GET(the_url, write_disk(the_excel_file)))
 # the first row of the sheet is skipped before the header is read
 ef <- read_excel(the_excel_file, skip = 1)

 # Keep country and ethnic score, with lower-case country names, the
 # US/UK recoded to the abbreviations used for the wikileaks data, and
 # incomplete rows dropped.
 ef_clean <-
   ef %>%
   select(Country, Ethnic) %>%
   mutate(Country = tolower(Country),
          Ethnic = as.numeric(Ethnic)) %>%
   # NOTE(review): "united states" is accepted as a second spelling in
   # case the sheet does not use the long form -- confirm which one the
   # data contains
   mutate(Country = if_else(Country %in% c("the united states of america",
                                           "united states"),
                            "us",
                            if_else(Country == "united kingdom", "uk",
                                    Country))) %>%
   na.omit()

 # see how this correlates with other things...
 # Ethnic score against the WJP open government index.
 ef_and_osi <-
   left_join(ef_clean,
             og_clean,
             by = c("Country" = "countries")) %>%
   na.omit()

 # `formula` is the y ~ x formula defined for the regression plots earlier
 # in the script; it is now also passed to geom_smooth() so the drawn line
 # and the printed equation come from the same model.
 # print() so the plot is also drawn when the script is run via source().
 print(
   ggplot(ef_and_osi,
          aes(Ethnic,
              openness)) +
     geom_text(aes(label = Country)) +
     geom_smooth(method = "lm", formula = formula) +
     stat_poly_eq(aes(label = paste(..eq.label..,
                                    ..adj.rr.label..,
                                    sep = "~~~~")),
                  formula = formula,
                  rr.digits = 3,
                  coef.digits = 2,
                  parse = TRUE, hjust = -0.25) +
     theme_bw()
 )

 # Ethnic score against the TI corruption perceptions index. The join key
 # is spelled out instead of relying on the implicit join over all shared
 # column names.
 ef_and_ti <-
   left_join(ef_clean,
             ti_clean,
             by = "Country") %>%
   na.omit()

 print(
   ggplot(ef_and_ti,
          aes(Ethnic,
              CPI2016)) +
     geom_text(aes(label = Country)) +
     geom_smooth(method = "lm", formula = formula) +
     stat_poly_eq(aes(label = paste(..eq.label..,
                                    ..adj.rr.label..,
                                    sep = "~~~~")),
                  formula = formula,
                  rr.digits = 3,
                  coef.digits = 2,
                  parse = TRUE, hjust = -0.25) +
     theme_bw()
 )
	# Get the list of wikileaks files and directories: read the file index
	# page and take the link text of every anchor on it.
	library(rvest)
	wl <- read_html("https://file.wikileaks.org/file/")

	# Clean the names a little so that words can be matched individually.
	wllt <-
	  wl %>%
	  # get text of the links
	  html_nodes("a") %>%
	  html_text() %>%
	  # replace _, - and . with spaces; inside the character class the dot
	  # is literal. The old pattern used the escape \| inside a string,
	  # which R rejects at parse time, and "//." never matched a plain dot.
	  gsub("[_.-]", " ", .) %>%
	  # all lower case
	  tolower()

	# Search terms: the lower-cased English country names shipped with the
	# countrycode package, plus "us" and "uk" in the way they are used in
	# wikileaks file/folder names.
	library(countrycode)
	countries <- c(
	  tolower(countrycode_data$country.name.en),
	  "us",
	  "uk"
	)

	# compute frequency of countries in file/directory names

	# Pre-allocated logical matrix of matches: one row per country, one
	# column per wikileaks file/folder name.
	output <- matrix(FALSE,
	                 nrow = length(countries),
	                 ncol = length(wllt))

	# For each country, test all file/folder names with a single vectorised
	# grepl() call. The name is wrapped in \\b word boundaries so it only
	# matches whole words: plain substring matching also counted "us" inside
	# "russia" or "business" and "uk" inside "ukraine".
	for (i in seq_along(countries)) {
	  output[i, ] <- grepl(paste0("\\b", countries[i], "\\b"), wllt)
	}

	# make the output a bit easier to read
	output_df <- as.data.frame(output)
	names(output_df) <- wllt
	row.names(output_df) <- countries

	# tally up the number of times each country is mentioned
	library(dplyr)

	# The row sums of the logical match table are the number of file/folder
	# names that mention each country. The table is assembled directly
	# instead of going through as_data_frame() on a vector (deprecated in
	# tibble) followed by a rename of its "value" column.
	country_freqs <-
	  data.frame(Freq = unname(rowSums(output_df)),
	             country = row.names(output_df),
	             stringsAsFactors = FALSE) %>%
	  arrange(desc(Freq)) %>%
	  # only countries mentioned at least twice
	  filter(Freq >= 2)

	# plot
	library(ggplot2)
	# base font size passed to the plot themes below
	base_size <- 12

	# Horizontal bar chart of the mention counts, with the countries ordered
	# by frequency.
	wl_freq_plot <-
	  ggplot(country_freqs,
	         aes(reorder(country,
	                     Freq),
	             Freq)) +
	  geom_col() +
	  coord_flip() +
	  theme_bw(base_size = base_size) +
	  xlab("Countries mentioned\nat least twice") +
	  ylab("Number of Wikileaks file/folder-names \ncontaining country name") +
	  ggtitle("The US, UK, and Iraq are most frequently\nmentioned in Wikileaks")


	#------------------------------------------------------------------------------

	# Get World Justice Project (WJP) Open Government Index data
	library(readxl)
	library(httr)

	the_url <- "http://www.worldjusticeproject.org/sites/default/files/wjp-open-gov-2015_data.xlsx"
	# Download the workbook to a temporary file. stop_for_status() turns a
	# failed request into an R error here, rather than letting read_excel()
	# fail later on whatever the server sent back.
	the_excel_file <- tempfile(fileext = ".xlsx")
	stop_for_status(GET(the_url, write_disk(the_excel_file)))
	og <- read_excel(the_excel_file)
	# make the column headers syntactically valid, unique R names
	names(og) <- make.names(names(og), unique = TRUE)

	# Keep the country and score columns, with lower-case country names and
	# the US/UK recoded to the abbreviations used for the wikileaks data.
	og_clean <-
	  og %>%
	  # NOTE(review): assumes the country column arrives as "NA." after
	  # make.names() and that "Scores" is the overall index -- confirm
	  # against the downloaded sheet
	  rename(countries = NA.,
	         openness = Scores) %>%
	  select(countries, openness) %>%
	  mutate(countries = tolower(countries),
	         openness = as.numeric(openness)) %>%
	  mutate(countries = if_else(countries == "united states", "us",
	                             if_else(countries == "united kingdom", "uk",
	                                     countries)))

	# Attach the open government score to each country's wikileaks count and
	# drop every row that has a missing value after the join.
	wl_and_wjp <- na.omit(
	  left_join(country_freqs,
	            og_clean,
	            by = c("country" = "countries"))
	)

	# Scatter plot of wikileaks counts against the WJP open governance
	# index; countries with an index below 0.5 are drawn again in red.
	library(ggrepel)
	library(ggalt)

	wl_and_wjp_all_plot <-
	  ggplot(data = wl_and_wjp,
	         mapping = aes(x = Freq, y = openness)) +
	  geom_point(size = 3) +
	  geom_point(data = subset(wl_and_wjp, openness < 0.5),
	             colour = "red",
	             size = 3) +
	  geom_text_repel(mapping = aes(label = country)) +
	  theme_bw(base_size = base_size) +
	  xlab("Wikileaks items") +
	  ylab("World Justice Project\nOpen Government Index")

	# US is right out there! And UK is out there a bit too.
	# Let's remove them and see what we have...
	# The filtered data is stored once and used for every layer below, so the
	# red highlight layer can no longer draw points for the excluded
	# countries (it used to be built from the unfiltered wl_and_wjp).
	wl_and_wjp_others <-
	  wl_and_wjp %>%
	  filter(!country %in% c("uk", "us"))

	# print() so the plot is also drawn when the script is run via source()
	print(
	  ggplot(wl_and_wjp_others,
	         aes(Freq,
	             openness)) +
	    geom_point(size = 3) +
	    geom_point(size = 3,
	               data = subset(wl_and_wjp_others,
	                             openness < 0.5),
	               colour = "red") +
	    geom_text_repel(aes(label = country)) +
	    theme_bw(base_size = base_size) +
	    labs(x = "Wikileaks items",
	         y = "World Justice Project\nOpen Government Index")
	)

	# is there a linear relationship between the OGI and the WL count?
	library(ggpmisc)
	formula <- y ~ x

	wl_and_wjp_regression <-
	  ggplot(wl_and_wjp_others,
	         aes(Freq,
	             openness)) +
	  # confidence band of the linear fit
	  geom_ribbon(stat = 'smooth',
	              method = "lm",
	              formula = formula,
	              se = TRUE,
	              alpha = 0.05,
	              aes(color = NULL)) +
	  # fitted line, using the same formula as the band and the equation label
	  geom_line(stat = 'smooth',
	            method = "lm",
	            formula = formula,
	            alpha = 0.3,
	            size = 1) +
	  # equation and adjusted R^2 of the fit, drawn on the plot
	  stat_poly_eq(aes(label = paste(..eq.label..,
	                                 ..adj.rr.label..,
	                                 sep = "~~~~")),
	               formula = formula,
	               rr.digits = 3,
	               coef.digits = 2,
	               parse = TRUE, hjust = -0.25) +
	  geom_point(size = 3) +
	  # countries with an index below 0.5 are drawn again in red
	  geom_point(size = 3,
	             data = subset(wl_and_wjp_others,
	                           openness < 0.5),
	             colour = "red") +
	  geom_text_repel(aes(label = country)) +
	  theme_bw(base_size = base_size) +
	  labs(x = "Wikileaks items (excluding US & UK)",
	       y = "World Justice Project\nOpen Government Index") +
	  ggtitle("A very weak relationship between the number of\nWikileaks items and the Open Government Index")

	# not really... what about a difference between countries with OGI < 0.5 and countries with OGI > 0.5?

	library(broom)

	# Per OGI group ("high" when openness > 0.5, otherwise "low"): the number
	# of countries and their summed wikileaks frequency.
	ogi_hi_lo <-
	  summarise(
	    group_by(
	      mutate(wl_and_wjp,
	             ogi = if_else(openness > 0.5, "high", "low")),
	      ogi
	    ),
	    count_countries = n(),
	    total_frequency = sum(Freq)
	  )

	# Chi-squared test on the two numeric columns, tidied with broom.
	ogi_hi_lo_test <- tidy(chisq.test(select(ogi_hi_lo, -ogi)))

	# Test result as text, for use in the plot title.
	chi_sq_output <- paste0(
	  "chi-square = ", round(ogi_hi_lo_test$statistic, 3),
	  ", ",
	  "p-value = ", round(ogi_hi_lo_test$p.value, 3)
	)

	library(ggmosaic)
	library(viridis)

	# Mosaic plot of the two OGI groups, weighted by total frequency.
	wl_ogi_rank_plot <-
	  ggplot(data = ogi_hi_lo) +
	  geom_mosaic(mapping = aes(weight = total_frequency,
	                            x = product(count_countries),
	                            fill = ogi)) +
	  scale_fill_viridis(name = "Open\nGovernment\nIndex rank",
	                     discrete = TRUE) +
	  xlab("Number of countries in Wikileaks files/folder-names") +
	  theme_bw(base_size = base_size) +
	  ggtitle(paste0("Significantly more countries with high\nOpen Governance Index values in Wikileaks \n(",
	                 chi_sq_output,
	                 ")"))

	# there is a significant difference, WL has significantly MORE documents
	# for countries with HIGH OGI values. Not what we'd expect if WL is
	# focused on opening government in the most needful cases.


	# here are the main visualisations for the WL and OGI data
	# print() is called explicitly so the plots are also drawn when the
	# script is run with source(), where top-level values are not
	# auto-printed by default.
	print(wl_freq_plot)
	print(wl_and_wjp_regression)
	print(wl_ogi_rank_plot)


	# Get Transparency International's (TI) Corruption Perceptions Index (CPI) data

	library(readxl)
	library(httr)

	the_url <- "http://files.transparency.org/content/download/2060/13252/file/CPI2016_FullDataSetWithRegionalTables.xlsx"
	# Download the workbook to a temporary file. stop_for_status() turns a
	# failed request into an R error here, rather than letting read_excel()
	# fail later on whatever the server sent back.
	the_excel_file <- tempfile(fileext = ".xlsx")
	stop_for_status(GET(the_url, write_disk(the_excel_file)))
	ti <- read_excel(the_excel_file)
	# make the column headers syntactically valid, unique R names
	names(ti) <- make.names(names(ti), unique = TRUE)

	# Keep country and 2016 score, with lower-case country names and the
	# US/UK recoded to the abbreviations used for the wikileaks data.
	ti_clean <-
	  ti %>%
	  select(Country, CPI2016) %>%
	  mutate(Country = tolower(Country)) %>%
	  # NOTE(review): "united states" is accepted as a second spelling in
	  # case the sheet does not use the long form -- confirm which one the
	  # data contains
	  mutate(Country = if_else(Country %in% c("the united states of america",
	                                          "united states"),
	                           "us",
	                           if_else(Country == "united kingdom", "uk",
	                                   Country)))

	# Attach the CPI score to each country's wikileaks count and drop every
	# row that has a missing value after the join.
	# (Despite its name, ti_and_wjp holds wikileaks counts joined with TI data.)
	ti_and_wjp <- na.omit(
	  left_join(country_freqs,
	            ti_clean,
	            by = c("country" = "Country"))
	)

	# Scatter plot of wikileaks counts against the TI CPI; countries with a
	# CPI below 50 are drawn again in red.
	library(ggrepel)
	library(ggalt)

	ti_and_wjp_all_plot <-
	  ggplot(data = ti_and_wjp,
	         mapping = aes(x = Freq, y = CPI2016)) +
	  geom_point(size = 3) +
	  geom_point(data = subset(ti_and_wjp, CPI2016 < 50),
	             colour = "red",
	             size = 3) +
	  geom_text_repel(mapping = aes(label = country)) +
	  theme_bw(base_size = base_size) +
	  xlab("Wikileaks items") +
	  ylab("Transparency International\nCorruption Perceptions Index")

	# Is there a linear relationship between the CPI and the WL count?
	library(ggpmisc)
	formula <- y ~ x

	# Leave out the US, UK and Iraq before fitting.
	ti_and_wjp_others <- filter(ti_and_wjp,
	                            !country %in% c("uk", "us", "iraq"))

	ti_and_wjp_regression <-
	  ggplot(data = ti_and_wjp_others,
	         mapping = aes(x = Freq, y = CPI2016)) +
	  # confidence band of the linear fit
	  geom_ribbon(mapping = aes(color = NULL),
	              stat = 'smooth',
	              method = "lm",
	              formula = formula,
	              se = TRUE,
	              alpha = 0.05) +
	  # fitted line
	  geom_line(stat = 'smooth',
	            method = "lm",
	            size = 1,
	            alpha = 0.3) +
	  # equation and adjusted R^2 of the fit, drawn on the plot
	  stat_poly_eq(mapping = aes(label = paste(..eq.label..,
	                                           ..adj.rr.label..,
	                                           sep = "~~~~")),
	               formula = formula,
	               rr.digits = 3,
	               coef.digits = 2,
	               parse = TRUE,
	               hjust = -0.25) +
	  geom_point(size = 3) +
	  # countries with a CPI below 50 are drawn again in red
	  geom_point(data = subset(ti_and_wjp_others, CPI2016 < 50),
	             colour = "red",
	             size = 3) +
	  geom_text_repel(mapping = aes(label = country)) +
	  theme_bw(base_size = base_size) +
	  xlab("Wikileaks items (excluding US, UK & Iraq)") +
	  ylab("Transparency International\nCorruption Perceptions Index") +
	  ggtitle("A very weak relationship between the number of\nWikileaks items and the Corruption Perceptions Index")

	# not really... what about a difference between countries with CPI < 50 and countries with CPI > 50?

	library(broom)

	# Per CPI group ("high" when CPI2016 > 50, otherwise "low"): the number
	# of countries and their summed wikileaks frequency.
	cpi_hi_lo <-
	  summarise(
	    group_by(
	      mutate(ti_and_wjp,
	             cpi = if_else(CPI2016 > 50, "high", "low")),
	      cpi
	    ),
	    count_countries = n(),
	    total_frequency = sum(Freq)
	  )

	# Chi-squared test on the two numeric columns, tidied with broom.
	cpi_hi_lo_test <- tidy(chisq.test(select(cpi_hi_lo, -cpi)))

	# Test result as text, for use in the plot title.
	chi_sq_output <- paste0(
	  "chi-square = ", round(cpi_hi_lo_test$statistic, 3),
	  ", ",
	  "p-value = ", round(cpi_hi_lo_test$p.value, 10)
	)

	library(ggmosaic)
	library(viridis)

	# Mosaic plot of the two CPI groups, weighted by total frequency.
	wl_cpi_rank_plot <-
	  ggplot(data = cpi_hi_lo) +
	  geom_mosaic(mapping = aes(weight = total_frequency,
	                            x = product(count_countries),
	                            fill = cpi)) +
	  scale_fill_viridis(name = "Corruption\nPerceptions\nIndex rank",
	                     discrete = TRUE) +
	  xlab("Number of countries in Wikileaks files/folder-names") +
	  theme_bw(base_size = base_size) +
	  ggtitle(paste0("Significantly more countries with\nlow corruption values in Wikileaks \n(",
	                 chi_sq_output,
	                 ")"))






	# ---------------------------------------------------------------------------

	# Get the fractionalization data set and keep its "Ethnic" column
	library(readxl)
	library(httr)

	the_url <- "http://www.anderson.ucla.edu/faculty_pages/romain.wacziarg/downloads/fractionalization.xls"
	# Download the workbook to a temporary file. stop_for_status() turns a
	# failed request into an R error here, rather than letting read_excel()
	# fail later on whatever the server sent back.
	the_excel_file <- tempfile(fileext = ".xls")
	stop_for_status(GET(the_url, write_disk(the_excel_file)))
	# the first row of the sheet is skipped before the header is read
	ef <- read_excel(the_excel_file, skip = 1)

	# Keep country and ethnic score, with lower-case country names, the
	# US/UK recoded to the abbreviations used for the wikileaks data, and
	# incomplete rows dropped.
	ef_clean <-
	  ef %>%
	  select(Country, Ethnic) %>%
	  mutate(Country = tolower(Country),
	         Ethnic = as.numeric(Ethnic)) %>%
	  # NOTE(review): "united states" is accepted as a second spelling in
	  # case the sheet does not use the long form -- confirm which one the
	  # data contains
	  mutate(Country = if_else(Country %in% c("the united states of america",
	                                          "united states"),
	                           "us",
	                           if_else(Country == "united kingdom", "uk",
	                                   Country))) %>%
	  na.omit()

	# see how this correlates with other things...
	# Ethnic score against the WJP open government index.
	ef_and_osi <-
	  left_join(ef_clean,
	            og_clean,
	            by = c("Country" = "countries")) %>%
	  na.omit()

	# `formula` is the y ~ x formula defined for the regression plots earlier
	# in the script; it is now also passed to geom_smooth() so the drawn line
	# and the printed equation come from the same model.
	# print() so the plot is also drawn when the script is run via source().
	print(
	  ggplot(ef_and_osi,
	         aes(Ethnic,
	             openness)) +
	    geom_text(aes(label = Country)) +
	    geom_smooth(method = "lm", formula = formula) +
	    stat_poly_eq(aes(label = paste(..eq.label..,
	                                   ..adj.rr.label..,
	                                   sep = "~~~~")),
	                 formula = formula,
	                 rr.digits = 3,
	                 coef.digits = 2,
	                 parse = TRUE, hjust = -0.25) +
	    theme_bw()
	)

	# Ethnic score against the TI corruption perceptions index. The join key
	# is spelled out instead of relying on the implicit join over all shared
	# column names.
	ef_and_ti <-
	  left_join(ef_clean,
	            ti_clean,
	            by = "Country") %>%
	  na.omit()

	print(
	  ggplot(ef_and_ti,
	         aes(Ethnic,
	             CPI2016)) +
	    geom_text(aes(label = Country)) +
	    geom_smooth(method = "lm", formula = formula) +
	    stat_poly_eq(aes(label = paste(..eq.label..,
	                                   ..adj.rr.label..,
	                                   sep = "~~~~")),
	                 formula = formula,
	                 rr.digits = 3,
	                 coef.digits = 2,
	                 parse = TRUE, hjust = -0.25) +
	    theme_bw()
	)