Created
May 21, 2013 17:08
-
-
Save johnjosephhorton/5621448 to your computer and use it in GitHub Desktop.
Get country-specific wages from Wikipedia and plot them with ggplot2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
############################################################################
# AUTHOR: John Horton
# PURPOSE: Extract by-country minimum hourly wages from the Wikipedia page
# LAST MODIFIED: May 22, 2013
############################################################################
# Packages: XML for scraping the HTML tables, ggplot2 + scales for plotting.
library(XML)
library(ggplot2)
library(scales)

# Wikipedia page that holds the by-country minimum wage table.
url <- "http://en.wikipedia.org/wiki/List_of_minimum_wages_by_country"

# readHTMLTable() returns one element per table on the page; the second
# element is the one this script works with.
raw <- readHTMLTable(doc = url)
df.raw <- raw[[2]]

# Replace the scraped column headers with short, syntactic names.
colnames(df.raw) <- c(
  "country",
  "minimum_wage",
  "annual",
  "annual_ppp",
  "workweek",
  "hourly_usd",
  "hourly_intl",
  "perc_2011_gdp",
  "effective"
)
# Convert a scraped hourly-wage cell into a number.
#
# The scraped wage cells carry HTML junk and idiosyncrasies: some contain a
# literal "US$" currency marker, and (per the original author's note) the
# first 19 characters are span metadata that the XML parser picks up.
#
# Args:
#   x: a scraped wage cell; Vectorize() maps the function over each element.
# Returns:
#   The wage as a numeric value. as.numeric() yields NA (with a warning) when
#   the remaining text is not a number.
Clean.Wage <- Vectorize(function(x) {
  # fixed = TRUE matches the literal text "US$". Read as a regular expression
  # the "$" is an end-of-string anchor, so the marker was never stripped and
  # the affected rows turned into NA.
  without_currency <- gsub("US$", "", x, fixed = TRUE)
  # Keep everything from character 20 onward, i.e. drop the leading metadata.
  wage_text <- substring(without_currency, first = 20)
  as.numeric(wage_text)
})
# Strip the first character from a scraped country name.
#
# The original note says a stray '_' is attached to the country text; the code
# removes exactly one leading character, whatever it is.
#
# Args:
#   x: a scraped country cell; Vectorize() maps the function over each element.
# Returns:
#   The text of `x` from its second character onward.
Clean.Country <- Vectorize(
  FUN = function(x) {
    trimmed <- substring(x, first = 2)
    trimmed
  }
)
# Working data set: cleaned country names next to numeric hourly wages (USD).
df <- data.frame(
  country = Clean.Country(df.raw$country),
  min.wage = Clean.Wage(df.raw$hourly_usd)
)
# Build a dot plot of hourly minimum wage by country.
#
# There are a large number of countries, so the idea is to split the data
# into subsets and plot each subset individually.
#
# Args:
#   df:    data frame with the columns `country` and `min.wage`.
#   label: text placed on a second line of the plot title, describing the
#          subset being plotted.
# Returns:
#   A ggplot object; nothing is drawn until it is printed.
Make.MW.plot <- function(df, label) {
  title <- paste0("Hourly minimum wages by country \n", label)
  # Written with an explicit "\n" rather than a string literal that spans two
  # source lines, so the label does not depend on how the file is indented.
  wage_axis_label <- paste0(
    "Hourly Wage (USD) \n \n Source: Wikipedia, May, 21, 2013\n",
    "en.wikipedia.org/wiki/List_of_minimum_wages_by_country"
  )
  # ggplot() + geom_point() is what qplot(country, min.wage, data = df)
  # expands to; qplot() itself is deprecated in current ggplot2 releases.
  ggplot(df, aes(x = country, y = min.wage)) +
    geom_point() +
    ylab(wage_axis_label) +
    xlab("") +
    scale_y_continuous(labels = dollar) +
    coord_flip() +  # countries run down the vertical axis for readability
    theme_bw() +
    expand_limits(y = 0) +  # always include $0 on the wage axis
    ggtitle(title)
}
# Split countries into four wage bands, cut near the quartiles shown below,
# and build one plot per band.
## > summary(df$min.wage)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##   0.030   0.490   1.180   2.308   2.460  16.450
g.75 <- Make.MW.plot(
  df = subset(df, min.wage > 2.5),
  label = "> 2.50"
)
g.50 <- Make.MW.plot(
  df = subset(df, min.wage > 1.25 & min.wage <= 2.50),
  label = "1.25 < Minimum Wage <= 2.50"
)
g.25 <- Make.MW.plot(
  df = subset(df, min.wage > 0.50 & min.wage <= 1.25),
  label = "0.50 < Minimum Wage <= 1.25"
)
g.0 <- Make.MW.plot(
  df = subset(df, min.wage <= 0.50),
  label = "Minimum Wage <= 0.50"
)
# Render a plot object to an image file.
#
# Args:
#   filename: path of the file to write; its directory is created if missing.
#   g:        object to draw. It is rendered with print(), which is how a
#             ggplot object gets drawn onto the open device.
#   width, height: handed to the device function as `width` / `height`
#             (for the default png device these are pixel counts).
#   format:   name of the graphics device function to call, "png" by default.
# Returns:
#   The value of dev.off(), exactly as before.
Write.Image <- function(filename, g, width = 500, height = 500, format = "png") {
  # Graphics devices do not create missing directories; without this the
  # device fails to open on a fresh checkout.
  out_dir <- dirname(filename)
  if (!dir.exists(out_dir)) {
    dir.create(out_dir, recursive = TRUE)
  }
  do.call(format, list(filename, width = width, height = height))
  # Guarantee the device is closed even if print(g) throws; otherwise the
  # failed call would leave an open device (and a locked file) behind.
  device_open <- TRUE
  on.exit(if (device_open) dev.off(), add = TRUE)
  print(g)
  device_open <- FALSE
  dev.off()
}
# Save each wage-band plot as a PNG under ./minimum_wage_plots/.
Write.Image(filename = "./minimum_wage_plots/quartile_75.png", g = g.75)
Write.Image(filename = "./minimum_wage_plots/quartile_50.png", g = g.50)
Write.Image(filename = "./minimum_wage_plots/quartile_25.png", g = g.25)
Write.Image(filename = "./minimum_wage_plots/quartile_0.png", g = g.0)
# Bonus plot - kernel density estimate of the distribution of hourly minimums.
# ggplot() + geom_density() is what qplot(min.wage, geom = "density") expands
# to; qplot() itself is deprecated in current ggplot2 releases.
g.distro <- ggplot(df, aes(x = min.wage)) +
  geom_density() +
  scale_x_log10(labels = dollar) +
  # Explicit "\n" instead of a string literal spanning two source lines, so
  # the label does not depend on how the file is indented.
  xlab(paste0(
    "Hourly minimum wages in USD, log scale \n",
    " Source: Wikipedia, May, 21, 2013\n",
    "en.wikipedia.org/wiki/List_of_minimum_wages_by_country"
  )) +
  theme_bw()
Write.Image("./minimum_wage_plots/distr.png", g.distro)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment