Skip to content

Instantly share code, notes, and snippets.

@benmarwick
Last active April 25, 2017 01:10
Show Gist options
  • Save benmarwick/c6f3226a21e4f44c81a643b81e3ca426 to your computer and use it in GitHub Desktop.
Save benmarwick/c6f3226a21e4f44c81a643b81e3ca426 to your computer and use it in GitHub Desktop.
# Scrape the WikiLeaks file index and collect the text of every link on the
# page, i.e. the file and directory names.
library(rvest)
wl <- read_html("https://file.wikileaks.org/file/")

# Normalise the names for matching: turn the separators `_`, `-` and `.`
# into spaces and lower-case everything.
wllt <-
  wl %>%
  # text of every <a> element
  html_nodes("a") %>%
  html_text() %>%
  # `\\.` matches a literal dot; an unescaped `.` would match any character
  gsub("_|-|\\.", " ", .) %>%
  tolower()
# Build the list of lower-case country names to search for.
library(countrycode)
# Recent countrycode releases expose the lookup table as `codelist`; older
# releases call it `countrycode_data`. Use whichever this installation has.
country_table <- tryCatch(
  countrycode::codelist,
  error = function(e) countrycode::countrycode_data
)
countries <- tolower(country_table$country.name.en)
# add US and UK in the way that it's used in wikileaks file/folder names
countries <- c(countries, "us", "uk")
# Record which country names occur in which file/directory names.
# `output` is a logical matrix with one row per country and one column per
# wikileaks file/folder name.
output <- matrix(nrow = length(countries),
                 ncol = length(wllt))
for (i in seq_along(countries)) {
  # without a country name there is nothing to search for: keep the row NA
  if (is.na(countries[i])) {
    next
  }
  # Match the name literally (\Q...\E) and only as a whole word, i.e. with no
  # letter or digit directly before or after it. A plain substring search
  # counts "us" in "russia" or "status" and "uk" in "ukraine", and treats
  # characters such as "(" or "." in a country name as regex syntax.
  pattern <- paste0("(?<![[:alnum:]])\\Q", countries[i], "\\E(?![[:alnum:]])")
  # grepl() is vectorised over the names, so one call fills the whole row
  output[i, ] <- grepl(pattern, wllt, perl = TRUE)
}
# Turn the match matrix into a data frame whose columns are labelled with the
# file/folder names and whose rows are labelled with the country names.
output_df <- setNames(as.data.frame(output), wllt)
rownames(output_df) <- countries
# Tally how many file/folder names mention each country.
library(dplyr)
# The table is built directly from the row sums: `as_data_frame()` is
# deprecated and converting a bare vector to a tibble is discouraged.
# The result has the columns `Freq` and `country`, in that order.
country_freqs <-
  tibble::tibble(Freq = unname(rowSums(output_df)),
                 country = row.names(output_df)) %>%
  # most frequently mentioned countries first
  arrange(desc(Freq)) %>%
  # only countries mentioned at least twice
  filter(Freq >= 2)
# Bar chart of how often each country is mentioned, bars ordered by frequency.
library(ggplot2)
# base font size passed to theme_bw() by the plots in this script
base_size <- 12
wl_freq_plot <-
  ggplot(country_freqs,
         aes(reorder(country, Freq),
             Freq)) +
  geom_col() +
  # flip the axes so the country names run down the side of the chart
  coord_flip() +
  theme_bw(base_size = base_size) +
  xlab("Countries mentioned\nat least twice") +
  ylab("Number of Wikileaks file/folder-names \ncontaining country name") +
  ggtitle("The US, UK, and Iraq are most frequently\nmentioned in Wikileaks")
#------------------------------------------------------------------------------
# Get World Justice Project (WJP) Open Government Index data
library(readxl)
library(httr)
the_url <- "http://www.worldjusticeproject.org/sites/default/files/wjp-open-gov-2015_data.xlsx"
the_excel_file <- tempfile(fileext = ".xlsx")
# Stop with the HTTP status if the download fails, rather than handing an
# error page to read_excel().
stop_for_status(GET(the_url, write_disk(the_excel_file)))
og <- read_excel(the_excel_file)
# make the spreadsheet headers syntactically valid, unique R names
names(og) <- make.names(names(og), unique = TRUE)
og_clean <-
  og %>%
  # NOTE(review): assumes the country column comes through as `NA.` after
  # make.names() and the score column as `Scores` -- verify against the file
  rename(countries = NA.,
         openness = Scores) %>%
  select(countries, openness) %>%
  mutate(countries = tolower(countries),
         openness = as.numeric(openness)) %>%
  # use the same short names ("us", "uk") as the wikileaks country list
  mutate(countries = if_else(countries == "united states", "us",
                             if_else(countries == "united kingdom", "uk",
                                     countries)))
# Attach the openness score to each country's wikileaks count, then drop the
# rows that contain a missing value after the join.
wl_and_wjp <- na.omit(
  left_join(country_freqs,
            og_clean,
            by = c(country = "countries"))
)
# Scatter plot: wikileaks item counts against the WJP Open Government Index.
library(ggrepel)
library(ggalt)
wl_and_wjp_all_plot <-
  ggplot(data = wl_and_wjp,
         mapping = aes(x = Freq, y = openness)) +
  geom_point(size = 3) +
  # redraw the countries scoring below 0.5 in red
  geom_point(data = subset(wl_and_wjp, openness < 0.5),
             colour = "red",
             size = 3) +
  # label every point with its country name
  geom_text_repel(aes(label = country)) +
  theme_bw(base_size = base_size) +
  xlab("Wikileaks items") +
  ylab("World Justice Project\nOpen Government Index")
# US is right out there! And the UK is out there a bit too.
# Let's remove them and see what we have...
wl_and_wjp_others <-
  wl_and_wjp %>%
  filter(!country %in% c("uk", "us"))
ggplot(wl_and_wjp_others,
       aes(Freq,
           openness)) +
  geom_point(size = 3) +
  # Take the red highlight layer from the filtered data as well, so this
  # layer cannot draw the excluded countries again.
  geom_point(size = 3,
             data = subset(wl_and_wjp_others,
                           openness < 0.5),
             colour = "red") +
  geom_text_repel(aes(label = country)) +
  theme_bw(base_size = base_size) +
  labs(x = "Wikileaks items",
       y = "World Justice Project\nOpen Government Index")
# is there a linear relationship between the OGI and the WL count?
library(ggpmisc)
# model formula shared by the smoother layers and the equation label
formula <- y ~ x
# the US and UK are left out of this plot
wl_and_wjp_others <-
  wl_and_wjp %>%
  filter(!country %in% c("uk", "us"))
wl_and_wjp_regression <-
  ggplot(wl_and_wjp_others,
         aes(Freq,
             openness)) +
  # confidence band of the linear fit
  geom_ribbon(stat = "smooth",
              method = "lm",
              formula = formula,
              se = TRUE,
              alpha = 0.05,
              aes(color = NULL)) +
  # fitted line, using the same formula as the band and the label
  geom_line(stat = "smooth",
            method = "lm",
            formula = formula,
            alpha = 0.3,
            size = 1) +
  # equation and adjusted R-squared of the fit
  stat_poly_eq(aes(label = paste(..eq.label..,
                                 ..adj.rr.label..,
                                 sep = "~~~~")),
               formula = formula,
               rr.digits = 3,
               coef.digits = 2,
               parse = TRUE, hjust = -0.25) +
  geom_point(size = 3) +
  # Take the red highlight layer from the filtered data as well, so this
  # layer cannot draw the excluded countries again.
  geom_point(size = 3,
             data = subset(wl_and_wjp_others,
                           openness < 0.5),
             colour = "red") +
  geom_text_repel(aes(label = country)) +
  theme_bw(base_size = base_size) +
  labs(x = "Wikileaks items (excluding US & UK)",
       y = "World Justice Project\nOpen Government Index") +
  ggtitle("A very weak relationship between the number of\nWikileaks items and the Open Government Index")
# not really... what about a difference between countries with OGI < 0.5 and countries with OGI > 0.5?
library(broom)
# One row per OGI group ("high" when openness > 0.5, otherwise "low") with
# the number of countries and the summed wikileaks counts in that group.
ogi_hi_lo <-
  wl_and_wjp %>%
  group_by(ogi = if_else(openness > 0.5, "high", "low")) %>%
  summarise(count_countries = n(),
            total_frequency = sum(Freq))
# chi-square test on the two numeric columns, tidied into a data frame
ogi_hi_lo_test <- tidy(chisq.test(select(ogi_hi_lo, -ogi)))
# text version of the test result, used in the plot title
chi_sq_output <- paste0(
  "chi-square = ",
  round(ogi_hi_lo_test$statistic, 3), ", ",
  "p-value = ", round(ogi_hi_lo_test$p.value, 3)
)
# Mosaic plot of the high/low OGI groups, with the chi-square result quoted
# in the title.
library(ggmosaic)
library(viridis)
wl_ogi_rank_plot <-
  ggplot(data = ogi_hi_lo) +
  geom_mosaic(aes(weight = total_frequency,
                  x = product(count_countries),
                  fill = ogi)) +
  scale_fill_viridis(name = "Open\nGovernment\nIndex rank",
                     discrete = TRUE) +
  theme_bw(base_size = base_size) +
  labs(x = "Number of countries in Wikileaks files/folder-names",
       title = paste0("Significantly more countries with high\nOpen Governance Index values in Wikileaks \n(", chi_sq_output, ")"))
# There is a significant difference: WL has significantly MORE documents
# for countries with HIGH OGI values. Not what we'd expect if WL is
# focused on opening government in the most needful cases.
# here are the main visualisations for the WL and OGI data
# (each plot object is evaluated on its own line so that it is displayed
# when the script is run at top level)
wl_freq_plot
wl_and_wjp_regression
wl_ogi_rank_plot
# Get Transparency International's (TI) Corruption Perceptions Index (CPI) data
library(readxl)
library(httr)
the_url <- "http://files.transparency.org/content/download/2060/13252/file/CPI2016_FullDataSetWithRegionalTables.xlsx"
the_excel_file <- tempfile(fileext = ".xlsx")
# Stop with the HTTP status if the download fails, rather than handing an
# error page to read_excel().
stop_for_status(GET(the_url, write_disk(the_excel_file)))
ti <- read_excel(the_excel_file)
# make the spreadsheet headers syntactically valid, unique R names
names(ti) <- make.names(names(ti), unique = TRUE)
ti_clean <-
  ti %>%
  select(Country, CPI2016) %>%
  mutate(Country = tolower(Country)) %>%
  # use the same short names ("us", "uk") as the wikileaks country list
  # NOTE(review): assumes the sheet spells the US as
  # "The United States of America" -- verify against the file
  mutate(Country = if_else(Country == "the united states of america", "us",
                           if_else(Country == "united kingdom", "uk",
                                   Country)))
# Attach the CPI score to each country's wikileaks count, then drop the rows
# that contain a missing value after the join.
ti_and_wjp <- na.omit(
  left_join(country_freqs,
            ti_clean,
            by = c(country = "Country"))
)
# Scatter plot: wikileaks item counts against the TI Corruption Perceptions
# Index.
library(ggrepel)
library(ggalt)
ti_and_wjp_all_plot <-
  ggplot(data = ti_and_wjp,
         mapping = aes(x = Freq, y = CPI2016)) +
  geom_point(size = 3) +
  # redraw the countries with a CPI below 50 in red
  geom_point(data = subset(ti_and_wjp, CPI2016 < 50),
             colour = "red",
             size = 3) +
  # label every point with its country name
  geom_text_repel(aes(label = country)) +
  theme_bw(base_size = base_size) +
  xlab("Wikileaks items") +
  ylab("Transparency International\nCorruption Perceptions Index")
# is there a linear relationship between the CPI and the WL count?
library(ggpmisc)
# model formula used by the confidence band and the equation label
formula <- y ~ x
# the US, UK and Iraq are left out of the regression plot
ti_and_wjp_others <- filter(ti_and_wjp,
                            !country %in% c("uk", "us", "iraq"))
ti_and_wjp_regression <-
  ggplot(data = ti_and_wjp_others,
         mapping = aes(x = Freq, y = CPI2016)) +
  # confidence band of the linear fit
  geom_ribbon(aes(color = NULL),
              stat = "smooth",
              method = "lm",
              formula = formula,
              se = TRUE,
              alpha = 0.05) +
  # fitted line
  geom_line(stat = "smooth",
            method = "lm",
            size = 1,
            alpha = 0.3) +
  # equation and adjusted R-squared of the fit
  stat_poly_eq(aes(label = paste(..eq.label..,
                                 ..adj.rr.label..,
                                 sep = "~~~~")),
               formula = formula,
               coef.digits = 2,
               rr.digits = 3,
               parse = TRUE,
               hjust = -0.25) +
  geom_point(size = 3) +
  # redraw the countries with a CPI below 50 in red
  geom_point(data = subset(ti_and_wjp_others, CPI2016 < 50),
             colour = "red",
             size = 3) +
  geom_text_repel(aes(label = country)) +
  theme_bw(base_size = base_size) +
  labs(x = "Wikileaks items (excluding US, UK & Iraq)",
       y = "Transparency International\nCorruption Perceptions Index",
       title = "A very weak relationship between the number of\nWikileaks items and the Corruption Perceptions Index")
# not really... what about a difference between countries with CPI < 50 and countries with CPI > 50?
library(broom)
# One row per CPI group ("high" when CPI2016 > 50, otherwise "low") with the
# number of countries and the summed wikileaks counts in that group.
cpi_hi_lo <-
  ti_and_wjp %>%
  group_by(cpi = if_else(CPI2016 > 50, "high", "low")) %>%
  summarise(count_countries = n(),
            total_frequency = sum(Freq))
# chi-square test on the two numeric columns, tidied into a data frame
cpi_hi_lo_test <- tidy(chisq.test(select(cpi_hi_lo, -cpi)))
# text version of the test result, used in the plot title
chi_sq_output <- paste0(
  "chi-square = ",
  round(cpi_hi_lo_test$statistic, 3), ", ",
  "p-value = ", round(cpi_hi_lo_test$p.value, 10)
)
# Mosaic plot of the high/low CPI groups, with the chi-square result quoted
# in the title.
library(ggmosaic)
library(viridis)
wl_cpi_rank_plot <-
  ggplot(data = cpi_hi_lo) +
  geom_mosaic(aes(weight = total_frequency,
                  x = product(count_countries),
                  fill = cpi)) +
  scale_fill_viridis(name = "Corruption\nPerceptions\nIndex rank",
                     discrete = TRUE) +
  theme_bw(base_size = base_size) +
  labs(x = "Number of countries in Wikileaks files/folder-names",
       title = paste0("Significantly more countries with\nlow corruption values in Wikileaks \n(", chi_sq_output, ")"))
# ---------------------------------------------------------------------------
# Download the fractionalization spreadsheet and keep the `Ethnic` score per
# country.
library(readxl)
library(httr)
the_url <- "http://www.anderson.ucla.edu/faculty_pages/romain.wacziarg/downloads/fractionalization.xls"
the_excel_file <- tempfile(fileext = ".xls")
# Stop with the HTTP status if the download fails, rather than handing an
# error page to read_excel().
stop_for_status(GET(the_url, write_disk(the_excel_file)))
# the first row of the sheet is skipped so the next row supplies the headers
ef <- read_excel(the_excel_file, skip = 1)
ef_clean <-
  ef %>%
  select(Country, Ethnic) %>%
  mutate(Country = tolower(Country),
         Ethnic = as.numeric(Ethnic)) %>%
  # Use the short keys "us" and "uk" that og_clean and ti_clean use.
  # NOTE(review): the sheet's spelling of the US is not visible here, so the
  # plain "united states" is accepted as well -- verify against the file.
  mutate(Country = if_else(Country %in% c("the united states of america",
                                          "united states"),
                           "us",
                           if_else(Country == "united kingdom", "uk",
                                   Country))) %>%
  # drop rows with a missing country or a non-numeric score
  na.omit()
# see how this correlates with other things...
# first: the `Ethnic` score against the Open Government Index
ef_and_osi <- na.omit(
  left_join(ef_clean,
            og_clean,
            by = c(Country = "countries"))
)
ggplot(data = ef_and_osi,
       mapping = aes(x = Ethnic, y = openness)) +
  # country names are drawn in place of points
  geom_text(aes(label = Country)) +
  geom_smooth(method = "lm") +
  # equation and adjusted R-squared of the linear fit
  stat_poly_eq(aes(label = paste(..eq.label..,
                                 ..adj.rr.label..,
                                 sep = "~~~~")),
               formula = formula,
               coef.digits = 2,
               rr.digits = 3,
               parse = TRUE,
               hjust = -0.25) +
  theme_bw()
# the `Ethnic` score against the Corruption Perceptions Index
ef_and_ti <-
  # name the join key explicitly instead of relying on the natural join over
  # whatever columns the two tables happen to share
  left_join(ef_clean,
            ti_clean,
            by = "Country") %>%
  na.omit()
ggplot(ef_and_ti,
       aes(Ethnic,
           CPI2016)) +
  # country names are drawn in place of points
  geom_text(aes(label = Country)) +
  geom_smooth(method = "lm") +
  # equation and adjusted R-squared of the linear fit
  stat_poly_eq(aes(label = paste(..eq.label..,
                                 ..adj.rr.label..,
                                 sep = "~~~~")),
               formula = formula,
               rr.digits = 3,
               coef.digits = 2,
               parse = TRUE, hjust = -0.25) +
  theme_bw()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment