A few lines of R code (by @briatte) to load packages, download and scrape data, convert, sort, etc.
# remove characters from multiple data frame columns at once
# example: strip the * , % characters everywhere
# (gsub() keeps the dim and dimnames of the matrix, so the data frame
#  structure survives; note that every column comes back as character)
dw <- data.frame(gsub("\\*|,|%", "", as.matrix(dw)), stringsAsFactors = FALSE)
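# quick self-contained check of the trick above, on a made-up data frame
# (toy is a hypothetical stand-in; the real dw comes from the election data below)
toy <- data.frame(pct = c("12%", "8.5%"), votes = c("1,204", "3,570"))
toy <- data.frame(gsub("\\*|,|%", "", as.matrix(toy)), stringsAsFactors = FALSE)
str(toy)  # both columns are now clean character strings: "12", "8.5", "1204", ...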
# Download the Quality of Government Standard dataset and cache a CSV copy.
library(downloader)  # for download()
library(foreign)     # for read.dta()
file = "data/qog-cs.txt"
if (!file.exists(file)) {
  if (!file.exists(dta <- "data/qog-cs.dta")) {
    url = "http://www.qogdata.pol.gu.se/data/qog_std_cs.dta"
    download(url, dta, mode = "wb")
  }
  write.csv(read.dta(dta), file)
}
# Open local copy.
qog <- read.csv(file, stringsAsFactors = FALSE, header = TRUE)
# basic countrycode usage
# example with Quality of Government data
# Add geographic continents, matching on the ISO-3166-1 alpha-3 codes
# stored in ccodealp.
library(countrycode)
continent <- factor(countrycode(qog$ccodealp, "iso3c", "continent"))
# Load packages, installing them first if needed.
# (you could add quiet/quietly options to silence the startup messages)
packages <- c("ggplot2", "RCurl")
lapply(packages, FUN = function(x) {
  if (!require(x, character.only = TRUE)) {
    install.packages(x)
    library(x, character.only = TRUE)
  }
})
# merge map data and dataset
# averaging over the map data seems to fill it better than merging
# Transfer the variable to the map dataset: average Obama_Swing by state,
# then index the state means by each map polygon's region name.
stateShapes$Swing <- by(dw$Obama_Swing, uniqueStates, mean)[stateShapes$region]
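# minimal sketch of the same by(...)[index] pattern with made-up vectors,
# since dw, uniqueStates and stateShapes are defined elsewhere
scores <- c(2, 4, 10)
groups <- c("a", "a", "b")
lookup <- by(scores, groups, mean)  # named group means: a = 3, b = 10
lookup[c("a", "b", "b", "a")]       # expand the means back onto group labels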
## http://lemonde.fr/societe/article/2013/01/09/qui-approuve-le-mariage-homosexuel_1814480_3224.html
library(ggplot2)
library(reshape)
## Fetch the data (tab-separated values behind the Datawrapper chart).
str(x <- read.csv("http://datawrapper.de//chart/xw9NF/data", sep = "\t"))
head(x)
## Variable names: keep "party", strip the "X" prefix from the year columns.
names(x) <- c("party", gsub("X", "", names(x)[2:7]))
## Reshape from wide to long.
head(x <- melt(x, id = "party", variable_name = "t"))
## Line chart.
ggplot(x, aes(x = t, y = value, group = party, color = party)) + geom_line()
## I did not find how to draw the same chart with gvisLineChart.
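## untested sketch of a gvisLineChart equivalent: googleVis wants wide data
## (one column per series), so cast the long data back to wide first
library(googleVis)
wide <- cast(x, t ~ party)  # reshape::cast, fills from the "value" column
plot(gvisLineChart(wide, xvar = "t", yvar = names(wide)[-1]))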
# Google Ngrams trends for intellectual property vocabulary.
library(ngramr)
x = c("intellectual property rights", "property rights", "patents", "copyright", "digital copyright")
# the "(phrase*5)" syntax multiplies each frequency by 5 in the Ngram query
ggram(paste0("(", x, "*5)"),
      year_start = 1980,
      corpus = "eng_2012",
      geom = "point")
# Fertility rates against log GDP per capita, colored by continent.
library(countrycode)
library(ggplot2)
qog <- read.csv("data/qog_cs.csv", sep = ";")
qog$continent <- countrycode(qog$ccodealp, "iso3c", "continent")
qplot(data = qog, y = wdi_fr, x = log(wdi_gdpc),
      color = continent, label = ccodealp, geom = "text")
# order factor levels by another variable (here, by Obama swing)
# see ?reorder for other options
dw$State <- with(dw, reorder(State, Obama_Swing))
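# tiny self-contained demo of reorder() with made-up data
grp <- factor(c("x", "y", "z"))
val <- c(3, 1, 2)
levels(reorder(grp, val))  # "y" "z" "x": levels now follow the values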
# http://stackoverflow.com/questions/12544888/is-there-an-equivalent-r-function-to-stata-order-command/12547400#12547400
move <- function(data, variable, before) {
  m <- data[variable]
  r <- data[names(data) != variable]
  i <- match(before, names(r))
  pre <- r[seq_len(i - 1)]
  post <- r[i:length(names(r))]
  cbind(pre, m, post)
}
# example
library(MASS)
data(painters)
str(painters)
# Move 'Expression' variable before 'Drawing' variable.
new <- move(painters, "Expression", "Drawing")
View(new)
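# verify the new column order
names(new)  # "Composition" "Expression" "Drawing" "Colour" "School"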
# uses downloader to save a CSV spreadsheet
# example with Quality of Government data
library(downloader)
# Download Quality of Government Basic dataset.
file = "data/QOG.Basic.CS.csv"
if (!file.exists(file)) {
  url = "http://www.qog.pol.gu.se/digitalAssets/1373/1373417_qog_basic_cs_csv_120608.csv"
  download(url, file, mode = "wb")
}
# Open local copy.
qog <- read.csv(file, stringsAsFactors = FALSE, header = TRUE)
# uses RCurl to download from a CSV link
# example with Daily Kos data
library(RCurl)
# Create a filename for the dataset.
file = "data/DKOS.US.0812.txt"
# Store the address of the spreadsheet.
link = "https://docs.google.com/spreadsheet/pub?key=0Av8O-dN2giY6dEFCOFZ4ZnlKS0x3M3Y0WHd5aWFDWkE&output=csv"
# Download dataset.
if (!file.exists(file)) {
  message("Downloading the data...")
  # Download and read the remote spreadsheet.
  html <- textConnection(getURL(link, ssl.verifypeer = FALSE))
  # Convert and export CSV spreadsheet.
  write.csv(read.csv(html), file)
}
# Open local copy.
dkos <- read.csv(file, stringsAsFactors = FALSE)
# Check result.
str(dkos)
# uses downloader to save documentation files
# example with the Quality of Government codebook
library(downloader)
# Download Quality of Government Basic codebook.
file = "data/QOG.Basic.codebook.pdf"
if (!file.exists(file)) {
  url = "http://www.qogdata.pol.gu.se/codebook/codebook_basic_20120608.pdf"
  download(url, file, mode = "wb")
}
# gets real income growth data from the BEA website (for the US and each state separately)
library(downloader)
url = "http://www.bea.gov/iTable/download.cfm?ext=csv&fid=95B00AEA0D0F4551F3FFE2DEF6A88527CAA3FD0670540E68A18BEE95735B3DBD9AAF7DB4AD82265972EC1C8BB81FC4EBCACF89FB20F318A84DD56C7E821137F8"
file = "real_inc.csv"
download(url, file)
# skip the four header rows that precede the actual table
inc <- read.csv(file, skip = 4)
head(inc)
# for Vlad
# a first take at your mission: scraping page titles from cdep.ro
library(foreach)
library(RCurl)
library(XML)
scrape <- function(x) {
  # for security, wrap in try(textConnection())
  page <- getURL(x)
  html <- htmlTreeParse(page, useInternal = TRUE, encoding = "UTF-8")
  # extract the headline cell that holds the title
  node <- getNodeSet(html, path = "//td[@class='headline']")
  text <- xmlValue(node[[1]])
  # result = address + title
  return(c(x, text))
}
url = "http://www.cdep.ro/pls/proiecte/upl_pck.proiect?cam=1&idp="
# test run on the first 10 pages
data <- foreach(i = 1:10) %do% scrape(paste(url, i, sep = ""))
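# follow-up sketch, assuming the test run above succeeded: foreach returns
# a list, so bind the (url, title) pairs into a two-column data frame
results <- data.frame(do.call(rbind, data), stringsAsFactors = FALSE)
names(results) <- c("url", "title")
str(results)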
# there is a more complex way with readHTMLTable(getNodeSet(htmlParse(...), ...))
# example with the Electoral College, 5th table of the Wikipedia page
library(XML)
# Electoral college votes, 2012.
url = "http://en.wikipedia.org/wiki/Electoral_College_(United_States)"
# Extract fifth table.
college <- readHTMLTable(url, which = 5, stringsAsFactors = FALSE)
# Keep first and last columns, removing total electors.
college <- data.frame(State = college[, 1],
                      College = as.numeric(college[, 35]))
# Merge to main dataset.
dw <- merge(dw, college, by = "State")
# Check result.
str(dw)
# from Brendan O'Connor's "Better I/O routines"
# (util is assumed to be a pre-existing container; create one if needed)
util <- new.env()
util$read.tsv <- function(..., header = F, sep = '\t', quote = '', comment = '', na.strings = '', stringsAsFactors = FALSE) {
  # read.table() wrapper with default settings for no-nonsense, pure TSV
  # Typical use case is output from another program.
  # (R's defaults are more geared for human-readable datafiles, which is less
  # feasible for large-scale data anyway.)
  # These options are substantially faster than read.table() defaults.
  # (see e.g. LINK)
  # stringsAsFactors is the devil.
  args = list(...)
  args$header = header
  if (!is.null(args$col.names)) {
    # read.delim() is not smart about this. Yikes.
    args$header = FALSE
  }
  args$sep = sep
  args$quote = quote
  args$comment = comment
  args$stringsAsFactors = stringsAsFactors
  args$na.strings = na.strings
  do.call(read.delim, args)
}
util$write.tsv <- function(..., header = NA, col.names = F, row.names = F, sep = '\t', na = '', quote = F) {
  # 'header' maps onto 'col.names', for naming consistency with read.table()
  if (is.finite(header)) col.names = header
  write.table(..., col.names = col.names, row.names = row.names, sep = sep, na = na, quote = quote)
}
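# quick round-trip sketch of the two helpers, on a hypothetical demo.tsv file
d <- data.frame(id = 1:2, label = c("a", "b"))
util$write.tsv(d, file = "demo.tsv")  # headerless by default
util$read.tsv("demo.tsv", col.names = c("id", "label"))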