A few lines of R code (by @briatte) to load packages, download and scrape data, convert, sort, etc.
# remove characters from multiple data frame columns at once
# example: strip the * , % characters everywhere
# (gsub() keeps the dim and dimnames of the matrix, so the data frame
#  structure survives; note that every column comes back as character)
dw <- data.frame(gsub("\\*|,|%", "", as.matrix(dw)), stringsAsFactors = FALSE)
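# quick self-contained check of the trick above, on a made-up data frame
# (toy is a hypothetical stand-in; the real dw comes from the election data below)
toy <- data.frame(pct = c("12%", "8.5%"), votes = c("1,204", "3,570"))
toy <- data.frame(gsub("\\*|,|%", "", as.matrix(toy)), stringsAsFactors = FALSE)
str(toy)  # both columns are now clean character strings: "12", "8.5", "1204", ...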
# Download the Quality of Government Standard dataset and cache a CSV copy.
library(downloader)  # for download()
library(foreign)     # for read.dta()
file = "data/qog-cs.txt"
if (!file.exists(file)) {
  if (!file.exists(dta <- "data/qog-cs.dta")) {
    url = "http://www.qogdata.pol.gu.se/data/qog_std_cs.dta"
    download(url, dta, mode = "wb")
  }
  write.csv(read.dta(dta), file)
}
# Open local copy.
qog <- read.csv(file, stringsAsFactors = FALSE, header = TRUE)
# basic countrycode usage
# example with Quality of Government data
# Add geographic continents, matching on the ISO-3166-1 alpha-3 codes
# stored in ccodealp.
library(countrycode)
continent <- factor(countrycode(qog$ccodealp, "iso3c", "continent"))
# Load packages, installing them first if needed.
# (you could add quiet/quietly options to silence the startup messages)
packages <- c("ggplot2", "RCurl")
lapply(packages, FUN = function(x) {
  if (!require(x, character.only = TRUE)) {
    install.packages(x)
    library(x, character.only = TRUE)
  }
})
# merge map data and dataset
# averaging over the map data seems to fill it better than merging
# Transfer the variable to the map dataset: average Obama_Swing by state,
# then index the state means by each map polygon's region name.
stateShapes$Swing <- by(dw$Obama_Swing, uniqueStates, mean)[stateShapes$region]
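# minimal sketch of the same by(...)[index] pattern with made-up vectors,
# since dw, uniqueStates and stateShapes are defined elsewhere
scores <- c(2, 4, 10)
groups <- c("a", "a", "b")
lookup <- by(scores, groups, mean)  # named group means: a = 3, b = 10
lookup[c("a", "b", "b", "a")]       # expand the means back onto group labels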
## http://lemonde.fr/societe/article/2013/01/09/qui-approuve-le-mariage-homosexuel_1814480_3224.html
library(ggplot2)
library(reshape)
## Fetch the data (tab-separated values behind the Datawrapper chart).
str(x <- read.csv("http://datawrapper.de//chart/xw9NF/data", sep = "\t"))
head(x)
## Variable names: keep "party", strip the "X" prefix from the year columns.
names(x) <- c("party", gsub("X", "", names(x)[2:7]))
## Reshape from wide to long.
head(x <- melt(x, id = "party", variable_name = "t"))
## Line chart.
ggplot(x, aes(x = t, y = value, group = party, color = party)) + geom_line()
## I did not find how to draw the same chart with gvisLineChart.
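## untested sketch of a gvisLineChart equivalent: googleVis wants wide data
## (one column per series), so cast the long data back to wide first
library(googleVis)
wide <- cast(x, t ~ party)  # reshape::cast, fills from the "value" column
plot(gvisLineChart(wide, xvar = "t", yvar = names(wide)[-1]))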
# Google Ngrams trends for intellectual property vocabulary.
library(ngramr)
x = c("intellectual property rights", "property rights", "patents", "copyright", "digital copyright")
# the "(phrase*5)" syntax multiplies each frequency by 5 in the Ngram query
ggram(paste0("(", x, "*5)"),
      year_start = 1980,
      corpus = "eng_2012",
      geom = "point")
# Fertility rates against log GDP per capita, colored by continent.
library(countrycode)
library(ggplot2)
qog <- read.csv("data/qog_cs.csv", sep = ";")
qog$continent <- countrycode(qog$ccodealp, "iso3c", "continent")
qplot(data = qog, y = wdi_fr, x = log(wdi_gdpc),
      color = continent, label = ccodealp, geom = "text")
# order factor levels by another variable (here, by Obama swing)
# see ?reorder for other options
dw$State <- with(dw, reorder(State, Obama_Swing))
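# tiny self-contained demo of reorder() with made-up data
grp <- factor(c("x", "y", "z"))
val <- c(3, 1, 2)
levels(reorder(grp, val))  # "y" "z" "x": levels now follow the values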
# http://stackoverflow.com/questions/12544888/is-there-an-equivalent-r-function-to-stata-order-command/12547400#12547400
move <- function(data, variable, before) {
  m <- data[variable]
  r <- data[names(data) != variable]
  i <- match(before, names(r))
  pre <- r[seq_len(i - 1)]
  post <- r[i:length(names(r))]
  cbind(pre, m, post)
}
# example
library(MASS)
data(painters)
str(painters)
# Move 'Expression' variable before 'Drawing' variable.
new <- move(painters, "Expression", "Drawing")
View(new)
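# verify the new column order
names(new)  # "Composition" "Expression" "Drawing" "Colour" "School"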
# uses downloader to save a CSV spreadsheet
# example with Quality of Government data
library(downloader)
# Download Quality of Government Basic dataset.
file = "data/QOG.Basic.CS.csv"
if (!file.exists(file)) {
  url = "http://www.qog.pol.gu.se/digitalAssets/1373/1373417_qog_basic_cs_csv_120608.csv"
  download(url, file, mode = "wb")
}
# Open local copy.
qog <- read.csv(file, stringsAsFactors = FALSE, header = TRUE)
# uses RCurl to download from a CSV link
# example with Daily Kos data
library(RCurl)
# Create a filename for the dataset.
file = "data/DKOS.US.0812.txt"
# Store the address of the spreadsheet.
link = "https://docs.google.com/spreadsheet/pub?key=0Av8O-dN2giY6dEFCOFZ4ZnlKS0x3M3Y0WHd5aWFDWkE&output=csv"
# Download dataset.
if (!file.exists(file)) {
  message("Downloading the data...")
  # Download and read the remote spreadsheet.
  html <- textConnection(getURL(link, ssl.verifypeer = FALSE))
  # Convert and export CSV spreadsheet.
  write.csv(read.csv(html), file)
}
# Open local copy.
dkos <- read.csv(file, stringsAsFactors = FALSE)
# Check result.
str(dkos)
# uses downloader to save documentation files
# example with the Quality of Government codebook
library(downloader)
# Download Quality of Government Basic codebook.
file = "data/QOG.Basic.codebook.pdf"
if (!file.exists(file)) {
  url = "http://www.qogdata.pol.gu.se/codebook/codebook_basic_20120608.pdf"
  download(url, file, mode = "wb")
}
# gets real income growth data from the BEA website (for the US and each state separately)
library(downloader)
url = "http://www.bea.gov/iTable/download.cfm?ext=csv&fid=95B00AEA0D0F4551F3FFE2DEF6A88527CAA3FD0670540E68A18BEE95735B3DBD9AAF7DB4AD82265972EC1C8BB81FC4EBCACF89FB20F318A84DD56C7E821137F8"
file = "real_inc.csv"
download(url, file)
# skip the four header rows that precede the actual table
inc <- read.csv(file, skip = 4)
head(inc)
# for Vlad
# a first take at your mission: scraping page titles from cdep.ro
library(foreach)
library(RCurl)
library(XML)
scrape <- function(x) {
  # for security, wrap in try(textConnection())
  page <- getURL(x)
  html <- htmlTreeParse(page, useInternal = TRUE, encoding = "UTF-8")
  # extract the headline cell that holds the title
  node <- getNodeSet(html, path = "//td[@class='headline']")
  text <- xmlValue(node[[1]])
  # result = address + title
  return(c(x, text))
}
url = "http://www.cdep.ro/pls/proiecte/upl_pck.proiect?cam=1&idp="
# test run on the first 10 pages
data <- foreach(i = 1:10) %do% scrape(paste(url, i, sep = ""))
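# follow-up sketch, assuming the test run above succeeded: foreach returns
# a list, so bind the (url, title) pairs into a two-column data frame
results <- data.frame(do.call(rbind, data), stringsAsFactors = FALSE)
names(results) <- c("url", "title")
str(results)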
# there is a more complex way with readHTMLTable(getNodeSet(htmlParse(...), ...))
# example with the Electoral College, 5th table of the Wikipedia page
library(XML)
# Electoral college votes, 2012.
url = "http://en.wikipedia.org/wiki/Electoral_College_(United_States)"
# Extract fifth table.
college <- readHTMLTable(url, which = 5, stringsAsFactors = FALSE)
# Keep first and last columns, removing total electors.
college <- data.frame(State = college[, 1],
                      College = as.numeric(college[, 35]))
# Merge to main dataset.
dw <- merge(dw, college, by = "State")
# Check result.
str(dw)
# from Brendan O'Connor's "Better I/O routines"
# (util is assumed to be a pre-existing container; create one if needed)
util <- new.env()
util$read.tsv <- function(..., header = F, sep = '\t', quote = '', comment = '', na.strings = '', stringsAsFactors = FALSE) {
  # read.table() wrapper with default settings for no-nonsense, pure TSV
  # Typical use case is output from another program.
  # (R's defaults are more geared for human-readable datafiles, which is less
  # feasible for large-scale data anyway.)
  # These options are substantially faster than read.table() defaults.
  # (see e.g. LINK)
  # stringsAsFactors is the devil.
  args = list(...)
  args$header = header
  if (!is.null(args$col.names)) {
    # read.delim() is not smart about this. Yikes.
    args$header = FALSE
  }
  args$sep = sep
  args$quote = quote
  args$comment = comment
  args$stringsAsFactors = stringsAsFactors
  args$na.strings = na.strings
  do.call(read.delim, args)
}
util$write.tsv <- function(..., header = NA, col.names = F, row.names = F, sep = '\t', na = '', quote = F) {
  # 'header' maps onto 'col.names', for naming consistency with read.table()
  if (is.finite(header)) col.names = header
  write.table(..., col.names = col.names, row.names = row.names, sep = sep, na = na, quote = quote)
}
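# quick round-trip sketch of the two helpers, on a hypothetical demo.tsv file
d <- data.frame(id = 1:2, label = c("a", "b"))
util$write.tsv(d, file = "demo.tsv")  # headerless by default
util$read.tsv("demo.tsv", col.names = c("id", "label"))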