Last active
August 12, 2020 18:13
-
-
Save briatte/5448112 to your computer and use it in GitHub Desktop.
a few lines of code to load packages, download and scrape data, convert, sort, etc.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Strip unwanted characters from every column of a data frame in one pass.
# The pattern below drops the "*", "," and "%" characters.
# The frame is coerced to a character matrix first, so every column of the
# rebuilt data frame holds text.
dw <- data.frame(
  gsub(pattern = "\\*|,|%", replacement = "", x = as.matrix(dw)),
  stringsAsFactors = FALSE
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download the Quality of Government cross-section dataset and cache it as CSV.
# download() and read.dta() must already be available; they are presumably
# provided by the 'downloader' and 'foreign' packages (confirm they are loaded).
file <- "data/qog-cs.txt"
if (!file.exists(file)) {
  # Fetch the Stata file only when no local copy exists yet.
  dta <- "data/qog-cs.dta"
  if (!file.exists(dta)) {
    url <- "http://www.qogdata.pol.gu.se/data/qog_std_cs.dta"
    download(url, dta, mode = "wb")
  }
  # row.names = FALSE: otherwise write.csv() adds an unnamed index column
  # that read.csv() below would load back as a spurious "X" variable.
  write.csv(read.dta(dta), file, row.names = FALSE)
}
# Open local copy.
qog <- read.csv(file, stringsAsFactors = FALSE, header = TRUE)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Basic countrycode usage, shown on the Quality of Government data.
# Translate the "iso3c" codes held in qog$ccodealp into continent names,
# then store the result as a factor.
continent <- countrycode(qog$ccodealp, "iso3c", "continent")
continent <- factor(continent)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load packages, installing any that are missing first.
# (install.packages() and library() both accept quiet options if the output
# is too noisy.)
packages <- c("ggplot2", "RCurl")
# The loop is run for its side effects only: its result is discarded so that
# 'packages' keeps holding the package names instead of being overwritten.
invisible(lapply(packages, function(pkg) {
  # requireNamespace() tests availability without attaching the package.
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  # library() stops with an error if the package still cannot be loaded.
  library(pkg, character.only = TRUE)
}))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Attach a per-state value to the map data.
# Averaging within each state and looking the result up by region name
# seems to fill the map better than a merge.
stateShapes$Swing <- local({
  # Mean Obama_Swing for each value of uniqueStates.
  swing_by_state <- by(dw$Obama_Swing, uniqueStates, mean)
  # One lookup per map row, keyed on the row's region.
  swing_by_state[stateShapes$region]
})
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Source article:
## http://lemonde.fr/societe/article/2013/01/09/qui-approuve-le-mariage-homosexuel_1814480_3224.html
library(ggplot2)
library(reshape)

## Fetch the tab-separated data and inspect it.
x <- read.csv("http://datawrapper.de//chart/xw9NF/data", sep = "\t")
str(x)
head(x)

## Variable names: the first column becomes "party"; columns 2 to 7 lose
## any "X" in their names.
names(x) <- c("party", gsub("X", "", names(x)[2:7]))

## Reshape to long format, one row per party and column 't'.
x <- melt(x, id = "party", variable_name = "t")
head(x)

## Line chart, one line per party.
ggplot(x, aes(x = t, y = value, group = party, color = party)) +
  geom_line()
## didn't find how to code it with gvisLineChart
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(ngramr)

# Phrases to look up.
x <- c(
  "intellectual property rights",
  "property rights",
  "patents",
  "copyright",
  "digital copyright"
)

# Each phrase is wrapped as "(<phrase>*5)" before being passed to ggram().
ggram(
  sprintf("(%s*5)", x),
  year_start = 1980,
  corpus = "eng_2012",
  geom = "point"
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Plot wdi_fr against log(wdi_gdpc), labelling each point with its country
# code and colouring it by continent.
# library() is used instead of require() so that a missing package stops the
# script immediately rather than failing later with an unrelated error.
library(countrycode)
library(ggplot2)

# Semicolon-separated local copy of the data.
qog <- read.csv("data/qog_cs.csv", sep = ";")

# Derive the continent from the "iso3c" codes in ccodealp.
qog$continent <- countrycode(qog$ccodealp, "iso3c", "continent")

# ggplot() + geom_text() replaces the deprecated qplot(..., geom = "text").
ggplot(
  qog,
  aes(x = log(wdi_gdpc), y = wdi_fr, color = continent, label = ccodealp)
) +
  geom_text()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Orders the levels of a factor by another variable.
# reorder() sorts the levels of State by the per-level summary of Obama_Swing
# (the mean, reorder()'s default FUN); see ?reorder for other options.
# The flag that makes the result an ordered factor is reorder()'s 'order'
# argument and must be given inside the reorder() call: with() silently
# ignores extra arguments, so the previous 'ordered = TRUE' had no effect.
dw$State <- with(dw, reorder(State, Obama_Swing, order = TRUE))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# http://stackoverflow.com/questions/12544888/is-there-an-equivalent-r-function-to-stata-order-command/12547400#12547400
#
# Move one or more columns of a data frame so that they sit immediately
# before another column (similar to Stata's 'order' command).
#
# Arguments:
#   data     a data frame.
#   variable name(s) of the column(s) to move.
#   before   name of the single column the moved column(s) are placed before.
#
# Returns 'data' with its columns reordered; values, column names and row
# names are untouched. If 'before' is itself one of the moved columns the
# data is returned unchanged. Stops with an error when a named column does
# not exist.
move <- function(data, variable, before) {
  cols <- names(data)
  var_idx <- match(variable, cols)
  before_idx <- match(before, cols)

  if (anyNA(var_idx)) {
    stop(
      "column(s) not found in data: ",
      paste(variable[is.na(var_idx)], collapse = ", "),
      call. = FALSE
    )
  }
  if (length(before_idx) != 1L || is.na(before_idx)) {
    stop("'before' must name exactly one column of data", call. = FALSE)
  }

  # Moving a column in front of itself leaves the order as it is.
  if (before_idx %in% var_idx) {
    return(data)
  }

  # Positions of the columns that stay put, in their current order.
  rest <- setdiff(seq_along(cols), var_idx)
  # Locate the target among the remaining columns: its position shifts left
  # when a moved column originally sat before it.
  target <- match(before_idx, rest)

  data[append(rest, var_idx, after = target - 1L)]
}
# Example using the 'painters' data from MASS.
library(MASS)
data(painters)
str(painters)

# Place the 'Expression' column directly in front of 'Drawing'.
new <- move(data = painters, variable = "Expression", before = "Drawing")

# Open the reordered data in the data viewer.
View(new)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Saves a CSV spreadsheet locally with download() (downloader package),
# illustrated with the Quality of Government Basic dataset.

# Local path of the cached spreadsheet.
file <- "data/QOG.Basic.CS.csv"

# Download only when there is no local copy yet.
if (!file.exists(file)) {
  url <- "http://www.qog.pol.gu.se/digitalAssets/1373/1373417_qog_basic_cs_csv_120608.csv"
  download(url, destfile = file, mode = "wb")
}

# Read the local copy, keeping text columns as character.
qog <- read.csv(file, header = TRUE, stringsAsFactors = FALSE)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Uses RCurl to download a spreadsheet published as CSV.
# Example with Daily Kos data.

# Create a filename for the dataset.
file <- "data/DKOS.US.0812.txt"
# Store the address of the spreadsheet (the 'output=csv' form of the link).
link <- "https://docs.google.com/spreadsheet/pub?key=0Av8O-dN2giY6dEFCOFZ4ZnlKS0x3M3Y0WHd5aWFDWkE&output=csv"

# Download the dataset only when there is no local copy yet.
if (!file.exists(file)) {
  message("Downloading the data...")
  # NOTE(review): ssl.verifypeer = FALSE disables TLS certificate
  # verification; kept as in the original, but consider removing it.
  csv_text <- getURL(link, ssl.verifypeer = FALSE)
  # Parse the text directly (no connection is left open) and cache it.
  # row.names = FALSE avoids a spurious "X" index column when the cached
  # file is read back below.
  write.csv(read.csv(text = csv_text), file, row.names = FALSE)
}

# Open file.
dkos <- read.csv(file, stringsAsFactors = FALSE)
# Check result.
str(dkos)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Saves a documentation file locally with download() (downloader package),
# illustrated with the Quality of Government Basic codebook.

# Local path of the codebook.
file <- "data/QOG.Basic.codebook.pdf"

# Fetch the PDF in binary mode unless a local copy is already there.
if (!file.exists(file)) {
  url <- "http://www.qogdata.pol.gu.se/codebook/codebook_basic_20120608.pdf"
  download(url, destfile = file, mode = "wb")
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Gets real income growth data from the BEA website
# (for the US and each state separately).
library(downloader)

# Address of the CSV export and the local file it is saved to.
url <- "http://www.bea.gov/iTable/download.cfm?ext=csv&fid=95B00AEA0D0F4551F3FFE2DEF6A88527CAA3FD0670540E68A18BEE95735B3DBD9AAF7DB4AD82265972EC1C8BB81FC4EBCACF89FB20F318A84DD56C7E821137F8"
file <- "real_inc.csv"

# The file is downloaded on every run.
download(url, destfile = file)

# Skip the first four lines of the export, then read the table.
# Note that 'file' is reused: from here on it holds the data, not the path.
file <- read.csv(file, skip = 4)
head(file)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# for vlad | |
# first take at your mission | |
library(foreach) | |
library(RCurl) | |
library(XML) | |
# Fetch one page and extract its headline.
#
# Arguments:
#   x   address of the page to fetch.
#
# Returns a character vector of length two: the address, followed by the
# text of the first <td class="headline"> cell, or NA when the page has no
# such cell. Download and parse errors are not caught here; wrap the call
# in try() if a failing page should not stop a batch run.
scrape <- function(x) {
  page <- getURL(x)
  # 'page' holds the document itself rather than a file name, hence asText.
  html <- htmlTreeParse(
    page,
    asText = TRUE,
    useInternalNodes = TRUE,
    encoding = "UTF-8"
  )
  # Extract the title.
  node <- getNodeSet(html, path = "//td[@class='headline']")
  # A page without a headline cell yields NA instead of a subscript error.
  text <- if (length(node) > 0) xmlValue(node[[1]]) else NA_character_
  # Result = address + title.
  c(x, text)
}
# Base address; the page number is appended to it.
url <- "http://www.cdep.ro/pls/proiecte/upl_pck.proiect?cam=1&idp="

# Test run on the first 10 pages: one scrape() result per page, as a list.
data <- foreach(page = paste0(url, 1:10)) %do% scrape(page)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Reads one HTML table straight from a web page.
# (There is a more complex way with readHTMLTable(getNode(htmlParse, ...)).)
# Example: electoral college votes, 2012, taken from the fifth table.
url <- "http://en.wikipedia.org/wiki/Electoral_College_(United_States)"

# Extract the fifth table on the page, keeping text as character.
college <- readHTMLTable(url, which = 5, stringsAsFactors = FALSE)

# Keep column 1 as the state name and column 35, converted to numeric,
# as the number of electors.
college <- data.frame(
  State = college[[1]],
  College = as.numeric(college[[35]])
)

# Merge into the main dataset on the state name.
dw <- merge(x = dw, y = college, by = "State")

# Check result.
str(dw)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# from Brendan O'Connor's "Better I/O routines"
#
# read.delim() wrapper with default settings for no-nonsense, pure TSV.
# Typical use case is output from another program. (R's defaults are more
# geared for human-readable datafiles, which is less feasible for
# large-scale data anyway.) Per the original author, these options are
# substantially faster than the read.table() defaults.
#
# Arguments:
#   ...               passed on to read.delim() (file, col.names, ...).
#   header            does the file start with a header line? Forced to
#                     FALSE whenever col.names is supplied through '...'.
#   sep, quote        field separator and quoting characters.
#   comment           comment character, passed on as 'comment.char'. A
#                     'comment.char' supplied through '...' takes precedence.
#   na.strings        strings read as NA.
#   stringsAsFactors  convert strings to factors? (stringsAsFactors is the
#                     devil.)
#
# Returns whatever read.delim() returns for these settings.
util$read.tsv <- function(..., header = FALSE, sep = "\t", quote = "",
                          comment = "", na.strings = "",
                          stringsAsFactors = FALSE) {
  args <- list(...)
  # read.delim() is not smart about combining col.names with a header,
  # so supplying col.names switches the header off.
  args$header <- if (is.null(args[["col.names"]])) header else FALSE
  args$sep <- sep
  args$quote <- quote
  # Use the full argument name instead of relying on partial matching, and
  # do not clash with a comment.char the caller passed explicitly.
  if (is.null(args[["comment.char"]])) {
    args[["comment.char"]] <- comment
  }
  args$stringsAsFactors <- stringsAsFactors
  args$na.strings <- na.strings
  do.call(read.delim, args)
}
# write.table() wrapper with TSV defaults: tab separator, no quoting, no row
# names, no column names, and NA written as an empty string.
#
# Arguments:
#   ...        passed on to write.table() (the data, the file, ...).
#   header     alias for col.names, for naming consistency with
#              read.table(); when it is a single non-missing value it
#              overrides col.names. The default NA leaves col.names alone.
#   col.names, row.names, sep, na, quote
#              passed on to write.table().
util$write.tsv <- function(..., header = NA, col.names = FALSE,
                           row.names = FALSE, sep = "\t", na = "",
                           quote = FALSE) {
  # The length check keeps the condition a single TRUE/FALSE even when
  # 'header' is NULL or a vector.
  if (length(header) == 1L && is.finite(header)) {
    col.names <- header
  }
  write.table(
    ...,
    col.names = col.names,
    row.names = row.names,
    sep = sep,
    na = na,
    quote = quote
  )
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment