Nimster · February 11, 2013 20:49
diff --git a/intro_to_R b/intro_to_R
 ######### Intro to R ###############

 ######### The Data Frame ###########
 # Demo data frame: one row per party (the party name is the row name), with
 # the leader's name, a political category and a mandate count.
 # stringsAsFactors = TRUE is spelled out because R >= 4.0.0 changed the
 # data.frame() default to FALSE; the "Factors" section further down prints
 # df$Category and expects to see a factor.
 df <- data.frame(
   row.names = c('HaLikud', 'Yesh Atid', 'HaAvoda', 'HaBait HaYehudi', 'Yehadut HaTora', 'Meretz', 'Shas'),
   LeaderName = c('Netanyahu', 'Lapid', 'Yehimovitch', 'Bennet', 'Litzman', 'GalOn', 'Yishai'),
   Category = c('Right', 'Center', 'Left', 'Right', 'Religious', 'Left', 'Religious'),
   Mandates = c(31, 19, 15, 12, 7, 6, 11),
   stringsAsFactors = TRUE
 )

 # Auto-print the whole data frame.
 df

 # Column names, then row names (the party names passed as row.names).
 colnames(df)
 rownames(df)

 # Rows where Category is 'Right'; the empty column index keeps every column.
 df[df$Category == 'Right', ]
 # Same rows, a single column selected by name.
 df[df$Category == 'Right', "Mandates"]
 # Several columns by name, returned in the order listed.
 df[df$Category == 'Right', c("Mandates", "LeaderName")]
 # Columns by position: 3 is Mandates, 1 is LeaderName.
 df[df$Category == 'Right', c(3, 1)]
 # Negative index: every column except the 2nd (Category).
 df[df$Category == 'Right', -2]

 # Compound filter: elementwise & and | combine the per-row tests.
 df[(df$Category == 'Right' & df$Mandates > 20) | (df$Category == 'Left' & df$Mandates < 15), ]

 # The filter on its own is just a logical vector, one entry per row.
 (df$Category == 'Right' & df$Mandates > 20) | (df$Category == 'Left' & df$Mandates < 15)

 # which() turns that logical vector into the positions of its TRUE entries.
 which( (df$Category == 'Right' & df$Mandates > 20) | (df$Category == 'Left' & df$Mandates < 15) )

 # Integer positions select rows just like the logical vector does.
 df[ which( (df$Category == 'Right' & df$Mandates > 20) | (df$Category == 'Left' & df$Mandates < 15) ) , ]

 # Assignment through an index: overwrite Mandates for the matching rows.
 df[ which( (df$Category == 'Right' & df$Mandates > 20) | (df$Category == 'Left' & df$Mandates < 15) ) , "Mandates"] <- 999

 # Replace the values just written (anything above 500) with NA.
 df[ df$Mandates > 500 , "Mandates"] <- NA

 # Keep only the rows whose Mandates is not NA.
 df[! is.na(df$Mandates), ]

 # %in% tests membership in a set of values.
 df[df$Category %in% c('Right', 'Left'), ]

 # with() evaluates the expression with df's columns visible as variables.
 with(df, Mandates * 3)
 # This script defines no top-level variable called Mandates, so the bare
 # reference below is expected to fail: the column is only visible inside with().
 Mandates # BOO; So how did this work?
 # Rubyists: understand this as a ruby block (closure)
 with(df, { print("HERE!")
            Mandates * 3 })
 # But this is unique
 # transform() returns df with the extra computed columns added; the result
 # is only printed here, df itself is not reassigned.
 transform(df, remaining = 61 - Mandates, logvoters = log(22500 * Mandates))

 ## Factors
 # Printing the column shows its values and, when the column is stored as a
 # factor, the set of levels.
 df$Category

 ## Matrices
 # matrix() fills column by column: 1..4 form column 1, 5..8 form column 2.
 matrix(c(1,2,3,4,5,6,7,8), nrow = 4, ncol = 2)

 # %*% is matrix multiplication: (4 x 2) times (2 x 3) gives a 4 x 3 matrix.
 matrix(c(1,2,3,4,5,6,7,8), nrow = 4, ncol = 2) %*% matrix(c(1,2,3,-1,-2,-3), nrow = 2, ncol = 3)

 ## Vectors
 # a:b builds an integer sequence.
 1:8

 # c() concatenates vectors.
 c(1:4, 10:15)

 # seq() steps from 1 to 2 by 0.2; each = 3 repeats every element three times,
 # then times = 2 repeats that whole result twice.
 rep(seq(1, 2, by = 0.2), each = 3, times = 2)

 ## Time Series
 # Series starting in 1960 with 12 observations per unit of time.
 # The argument is written out as `frequency`; the abbreviated `freq` only
 # worked through partial argument matching.
 ts(seq(100, 300, by = 5), start = 1960, frequency = 12)

 ## Everything is vectorized
 # Arithmetic applies elementwise; ^ is the documented power operator
 # (** is only an undocumented alias the parser rewrites to ^).
 sqrt((1:10)^2)
 # A comparison is already a logical vector, so wrapping it in
 # ifelse(..., TRUE, FALSE) adds nothing. NA in Mandates stays NA.
 df$Large <- df$Mandates > 10
 df
 # paste() is vectorized too: one new row name per row.
 rownames(df) <- paste(rownames(df), df$LeaderName, sep=' BeRashut ')
 df

 ## R is functional
 # Ratio of the sum of x and y to their difference.
 # Works elementwise, so x and y may be vectors.
 f <- function(x, y) {
   total <- x + y
   difference <- x - y
   total / difference
 }

 # Scalars in, scalar out.
 f(3, 4)
 # Vectors in: the arithmetic is elementwise, so this yields f(2, 4) and f(3, 5).
 f(2:3, 4:5)

 # outer() evaluates f for every pair (x from 1:3, y from 4:6): a 3 x 3 matrix.
 outer(1:3, 4:6, FUN=f)

 # Open the help page for binom.test.
 ?binom.test
 # Exact binomial test for 50 successes in 100 trials.
 binom.test(50, 100)
 # List the names of the components of the returned object.
 ls(binom.test(50, 100))
 # Pull out the confidence-interval component ...
 binom.test(50, 100)$conf.int
 # ... its first element ...
 binom.test(50, 100)$conf.int[1]
 # ... and the interval multiplied by 100.
 binom.test(50, 100)$conf.int * 100
 # as.vector() strips the attributes, leaving a plain numeric vector.
 as.vector(binom.test(50, 100)$conf.int * 100)

 # Confidence interval reported by binom.test(x, n), multiplied by n and
 # returned as a plain numeric vector without attributes.
 s <- function(x, n) {
   interval <- binom.test(x, n)$conf.int
   as.vector(interval * n)
 }
 s(50, 100)
 # binom.test() handles one experiment per call, so passing s() a vector of
 # counts is expected to fail here.
 s(50:55, 100) # BOO
 # Vectorize() wraps s so that it is called once per element of x.
 vs <- Vectorize(s, "x")
 vs(50:55, 100)
 # Name each input by its own value.
 x <- seq(10, 40, by = 10)
 names(x) <- x
 n <- seq(100, 400, by = 100)
 names(n) <- n
 # Vectorize over both arguments.
 vs2 <- Vectorize(s, c("x", "n"))
 vs2(x, 100)
 vs2(50, n)
 # x and n are walked in parallel, element by element:
 vs2(x, n) # s(x[1], n[1]), s(x[2], n[2]), s(x[3], n[3]), s(x[4], n[4])

 # Like this? explore mapply, ddply (plyr package), etc.

 ## R integrates well
 # Install gdata only when it is missing, so re-running the script does not
 # reinstall the package every time.
 if (!requireNamespace('gdata', quietly = TRUE)) {
   install.packages('gdata')
 }
 library(gdata)

 # Read the first sheet; cells containing ':' are read as NA and strings are
 # kept as character.
 xl <- read.xls('~/Downloads/eurostat_dirty.xlsx', sheet=1, na.strings=':', stringsAsFactors = FALSE)
 xl
 # Keep at most the first 39 rows. head() replaces xl[-(40:nrow(xl)), ]
 # because 40:nrow(xl) counts backwards when the sheet has fewer than 40 rows
 # and would then drop the wrong rows.
 xl <- head(xl, 39)
 # Use the first column as row names.
 rownames(xl) <- xl[, 1]
 # Keep every second column starting at column 2, then drop the first of those.
 xl <- xl[, seq(2, ncol(xl), by = 2)]
 xl <- xl[, -1]
 # Column names come from row 2, prefixed with "Y".
 colnames(xl) <- paste0("Y", xl[2, ])
 # Drop the first four rows.
 xl <- xl[-(1:4), ]

 # Column totals; an NA anywhere in a column makes that column's total NA.
 colSums(xl)
 # Same totals, skipping missing values (TRUE spelled out: T can be reassigned).
 colSums(xl, na.rm = TRUE)
 # Drop every row that still has a missing value.
 xl <- na.omit(xl)
 # Per-column maximum and mean (MARGIN = 2 means "over columns").
 apply(xl, MARGIN=2, max)
 apply(xl, MARGIN=2, mean)
 summary(xl)

 ## Advanced data processing
 # Bin Y2011 into (0, 30], (30, 100] and (100, Inf] and label the bins.
 xl$Bed_Category <- cut(xl$Y2011, c(0, 30, 100, Inf), labels = c("Little", "Medium", "Lots"))
 ?tapply
 # Mean of Y2011 within each bin.
 tapply(xl$Y2011, xl$Bed_Category, FUN = mean)

 # 1,1,1,2,2,2,3,3,3 repeated twice; rle() reports the runs of equal values.
 x <- rep(1:3, each = 3, times = 2)
 x
 rle(x)

 # Where to advertise? A multi-armed bandit approach.
 # Draw three row names with replacement, weighted by Y2011. The weights are
 # passed by name so the call does not rely on argument position.
 sample(rownames(xl), size = 3, replace = TRUE, prob = xl$Y2011)

 cut(xl$Y2001, 3)  # Generate cut points automatically. Oh-oh
 # Break points computed from the Y2001 values themselves.
 quantile(xl$Y2001, probs = c(0, 0.25, 0.75, 1))
 # Bin Y2001 on its own quantiles (the ones printed just above), so every
 # value lies inside the range of the breaks. Taking the breaks from Y2011
 # would turn any Y2001 value outside the Y2011 range into NA.
 xl$Old_Bed_Category <- cut(xl$Y2001,
     quantile(xl$Y2001, probs = c(0, 0.25, 0.75, 1)),
     labels = c('Low', 'Medium', 'High'),
     right = TRUE, include.lowest = TRUE) # Include both ends of the range

 xl

 # In R, the question is often "What's the function that does *THAT*?"
 # ftable() cross-tabulates the two category columns.
 ftable(xl[, c('Bed_Category', 'Old_Bed_Category')])

 ## Riddle: How do I find the problematic 5?
 ## ...
 ## ...
 ## ...
 # Answer: filter on both categories and read off the row names.
 rownames(xl[xl$Bed_Category == 'Little' & xl$Old_Bed_Category == 'Low', ])

 ## More data plays
 # order() returns the row positions that would sort Y2011 ascending ...
 order(xl$Y2011) # Huh?
 # ... so indexing the rows with it yields the sorted table.
 xl[order(xl$Y2011), ] # ahhhh
 # rank() gives each value's position in the sorted order (the inverse
 # permutation of order() when there are no ties).
 rank(xl$Y2011) # Inversed perm!

 ## Stats & Probability
 # Five draws from a uniform distribution between 0 and 3.
 runif(5, 0, 3)
 # One draw from a binomial distribution with size 100 and probability 0.5.
 rbinom(1, 100, 0.5)
 # Histograms of 10, 100 and 1000 such draws; the shape should look
 # increasingly bell-like as the number of draws grows.
 hist(rbinom(10, 100, 0.5))
 hist(rbinom(100, 100, 0.5))
 hist(rbinom(1000, 100, 0.5)) # CLT!
 # For comparison: 1000 draws from a normal distribution (mean 1.5, sd 1).
 hist(rnorm(1000, 1.5, 1))

 library(ggplot2)
 # Note: `l` holds this sample here; later sections reuse the name for model fits.
 l <- rnorm(1000, 1.5, 1)
 # qplot() builds a plot object; printing the object draws it.
 p <- qplot(l, geom = 'histogram')
 p
 # Labels and layers are added to a plot with +.
 p + xlab("Coffee breaks per day")

 ## More cool IO
 library(XML)
 # NOTE(review): plain-http URL; confirm readHTMLTable() can still fetch it.
 theurl <- "http://en.wikipedia.org/wiki/List_of_tallest_structures_in_the_world"
 tables <- readHTMLTable(theurl)
 if (length(tables) == 0) {
   stop("no HTML tables found at ", theurl)
 }
 # Row count per table. Entries without dimensions (readHTMLTable() can
 # return NULL for a table) count as 0 rows, so n.rows keeps one entry per
 # element of `tables`. unlist(lapply(...)) silently dropped those entries,
 # which shifted the position returned by which.max().
 n.rows <- vapply(tables, function(t) if (is.null(dim(t))) 0L else nrow(t), integer(1))
 # Work with the table that has the most rows.
 tbl <- tables[[which.max(n.rows)]]
 tbl
 tbl <- tbl[, 1:6]
 colnames(tbl)[c(2, 4, 5)] <- c('Height', 'Type', 'Use')
 # Height: keep the text before the first whitespace, then convert to numeric.
 tbl$Height <- as.numeric(gsub('\\s.*', '', tbl$Height))
 # Year: entries without any digit become NA; otherwise keep the leading
 # digits (everything from the first non-digit on is removed).
 tbl$Year[!grepl("\\d", tbl$Year)] <- NA
 tbl$Year <- as.numeric(sub("\\D.*", '', tbl$Year))

 qplot(data = tbl, x = Year, y = Height, color = Country)

 # Bar charts of the number of rows per Type, using complete rows only.
 qplot(data = na.omit(tbl), x = Type, geom="bar")
 qplot(data = na.omit(tbl), x = Type, geom="bar", fill = Country)

 ## Some linear models
 # mtcars is one of R's built-in example datasets.
 mtcars

 # Number of cars per cylinder count, with side-by-side bars per gear count.
 ggplot(data=mtcars, aes(factor(cyl), fill=factor(gear))) + geom_bar(position="dodge")

 # Linear regression of mpg on hp; printing the fit shows the coefficients.
 lm(data = mtcars, mpg ~ hp)
 l <- lm(data = mtcars, mpg ~ hp)
 summary(l) # Look at summary(l)$r.squared
 qplot(data = mtcars, x = hp, y = mpg, geom="point")
 # Same scatter plot with a fitted straight line overlaid.
 qplot(data = mtcars, x = hp, y = mpg, geom="point") + geom_smooth(method = 'lm')

 # Fitted values for the data the model was built on, then predictions for new hp values.
 predict(l)
 predict(l, newdata = data.frame(hp = seq(50, 300, by = 25)))

 # Quadratic fit: I() makes ^ mean arithmetic power inside the formula.
 l <- lm(data = mtcars, mpg ~ hp + I(hp^2))
 summary(l)
 # Draw the fitted curve by predicting on a grid of hp values (50 to 350 in steps of 3).
 qplot(data = mtcars, hp, mpg, geom="point") + geom_line(data = data.frame(hp=seq(50,350, by=3), mpg=predict(l, data.frame(hp=seq(50, 350, by=3)))), aes(x=hp, y=mpg))

 # Degree-10 polynomial in hp, drawn the same way on a grid from 50 to 335.
 l <- lm(data = mtcars, mpg ~ hp + I(hp^2) + I(hp^3) + I(hp^4) + I(hp^5) + I(hp^6) + I(hp^7) + I(hp^8) + I(hp^9) + I(hp^10))
 qplot(data = mtcars, hp, mpg, geom="point") + geom_line(data = data.frame(hp=seq(50,335, by=3), mpg=predict(l, data.frame(hp=seq(50, 335, by=3)))), aes(x=hp, y=mpg))

 # Regression on several predictors: hp, wt and cyl.
 l <- lm(data = mtcars, mpg ~ hp + wt + cyl)
 summary(l)
 # Note: geom_smooth(method = 'lm') fits its own line of mpg against wt for
 # this plot; it does not draw the model stored in `l`.
 qplot(data = mtcars, x = wt, y = mpg, geom="point") + geom_smooth(method = 'lm')

 ## ... Incidentally...
 # Join tbl's Country column against xl's row names; all = FALSE keeps only
 # the rows that match on both sides.
 merge(tbl, xl, by.x='Country', by.y = 'row.names', all = FALSE)
 # Pivots
 library(reshape)
 # Sum of Mandates per Category; na.rm = TRUE is handed on to sum().
 cast(df, Category ~ ., value='Mandates', fun.aggregate = sum, na.rm = TRUE)
 # Number of rows per Category / Large combination.
 cast(df, Category + Large ~ ., value='Mandates', fun.aggregate = length)
	######### Intro to R ###############

	######### The Data Frame ###########
	# Demo data frame: one row per party (the party name is the row name), with
	# the leader's name, a political category and a mandate count.
	# stringsAsFactors = TRUE is spelled out because R >= 4.0.0 changed the
	# data.frame() default to FALSE; the "Factors" section further down prints
	# df$Category and expects to see a factor.
	df <- data.frame(
	  row.names = c('HaLikud', 'Yesh Atid', 'HaAvoda', 'HaBait HaYehudi', 'Yehadut HaTora', 'Meretz', 'Shas'),
	  LeaderName = c('Netanyahu', 'Lapid', 'Yehimovitch', 'Bennet', 'Litzman', 'GalOn', 'Yishai'),
	  Category = c('Right', 'Center', 'Left', 'Right', 'Religious', 'Left', 'Religious'),
	  Mandates = c(31, 19, 15, 12, 7, 6, 11),
	  stringsAsFactors = TRUE
	)

	# Auto-print the whole data frame.
	df

	# Column names, then row names (the party names passed as row.names).
	colnames(df)
	rownames(df)

	# Rows where Category is 'Right'; the empty column index keeps every column.
	df[df$Category == 'Right', ]
	# Same rows, a single column selected by name.
	df[df$Category == 'Right', "Mandates"]
	# Several columns by name, returned in the order listed.
	df[df$Category == 'Right', c("Mandates", "LeaderName")]
	# Columns by position: 3 is Mandates, 1 is LeaderName.
	df[df$Category == 'Right', c(3, 1)]
	# Negative index: every column except the 2nd (Category).
	df[df$Category == 'Right', -2]

	# Compound filter: elementwise & and | combine the per-row tests.
	# (The backslash-escaped "\|" that stood here is not valid R; the operator is a bare |.)
	df[(df$Category == 'Right' & df$Mandates > 20) | (df$Category == 'Left' & df$Mandates < 15), ]

	# The filter on its own is just a logical vector, one entry per row.
	(df$Category == 'Right' & df$Mandates > 20) | (df$Category == 'Left' & df$Mandates < 15)

	# which() turns that logical vector into the positions of its TRUE entries.
	which( (df$Category == 'Right' & df$Mandates > 20) | (df$Category == 'Left' & df$Mandates < 15) )

	# Integer positions select rows just like the logical vector does.
	df[ which( (df$Category == 'Right' & df$Mandates > 20) | (df$Category == 'Left' & df$Mandates < 15) ) , ]

	# Assignment through an index: overwrite Mandates for the matching rows.
	df[ which( (df$Category == 'Right' & df$Mandates > 20) | (df$Category == 'Left' & df$Mandates < 15) ) , "Mandates"] <- 999

	# Replace the values just written (anything above 500) with NA.
	df[ df$Mandates > 500 , "Mandates"] <- NA

	# Keep only the rows whose Mandates is not NA.
	df[! is.na(df$Mandates), ]

	# %in% tests membership in a set of values.
	df[df$Category %in% c('Right', 'Left'), ]

	# with() evaluates the expression with df's columns visible as variables.
	with(df, Mandates * 3)
	# This script defines no top-level variable called Mandates, so the bare
	# reference below is expected to fail: the column is only visible inside with().
	Mandates # BOO; So how did this work?
	# Rubyists: understand this as a ruby block (closure)
	with(df, { print("HERE!")
	Mandates * 3 })
	# But this is unique
	# transform() returns df with the extra computed columns added; the result
	# is only printed here, df itself is not reassigned.
	transform(df, remaining = 61 - Mandates, logvoters = log(22500 * Mandates))

	## Factors
	# Printing the column shows its values and, when the column is stored as a
	# factor, the set of levels.
	df$Category

	## Matrices
	# matrix() fills column by column: 1..4 form column 1, 5..8 form column 2.
	matrix(c(1,2,3,4,5,6,7,8), nrow = 4, ncol = 2)

	# %*% is matrix multiplication: (4 x 2) times (2 x 3) gives a 4 x 3 matrix.
	matrix(c(1,2,3,4,5,6,7,8), nrow = 4, ncol = 2) %*% matrix(c(1,2,3,-1,-2,-3), nrow = 2, ncol = 3)

	## Vectors
	# a:b builds an integer sequence.
	1:8

	# c() concatenates vectors.
	c(1:4, 10:15)

	# seq() steps from 1 to 2 by 0.2; each = 3 repeats every element three times,
	# then times = 2 repeats that whole result twice.
	rep(seq(1, 2, by = 0.2), each = 3, times = 2)

	## Time Series
	# Series starting in 1960 with 12 observations per unit of time.
	# The argument is written out as `frequency`; the abbreviated `freq` only
	# worked through partial argument matching.
	ts(seq(100, 300, by = 5), start = 1960, frequency = 12)

	## Everything is vectorized
	# Arithmetic applies elementwise; ^ is the documented power operator
	# (** is only an undocumented alias the parser rewrites to ^).
	sqrt((1:10)^2)
	# A comparison is already a logical vector, so wrapping it in
	# ifelse(..., TRUE, FALSE) adds nothing. NA in Mandates stays NA.
	df$Large <- df$Mandates > 10
	df
	# paste() is vectorized too: one new row name per row.
	rownames(df) <- paste(rownames(df), df$LeaderName, sep=' BeRashut ')
	df

	## R is functional
	# Ratio of the sum of x and y to their difference.
	# Works elementwise, so x and y may be vectors.
	f <- function(x, y) {
	  total <- x + y
	  difference <- x - y
	  total / difference
	}

	# Scalars in, scalar out.
	f(3, 4)
	# Vectors in: the arithmetic is elementwise, so this yields f(2, 4) and f(3, 5).
	f(2:3, 4:5)

	# outer() evaluates f for every pair (x from 1:3, y from 4:6): a 3 x 3 matrix.
	outer(1:3, 4:6, FUN=f)

	# Open the help page for binom.test.
	?binom.test
	# Exact binomial test for 50 successes in 100 trials.
	binom.test(50, 100)
	# List the names of the components of the returned object.
	ls(binom.test(50, 100))
	# Pull out the confidence-interval component ...
	binom.test(50, 100)$conf.int
	# ... its first element ...
	binom.test(50, 100)$conf.int[1]
	# ... and the interval multiplied by 100.
	binom.test(50, 100)$conf.int * 100
	# as.vector() strips the attributes, leaving a plain numeric vector.
	as.vector(binom.test(50, 100)$conf.int * 100)

	# Confidence interval reported by binom.test(x, n), multiplied by n and
	# returned as a plain numeric vector without attributes.
	s <- function(x, n) {
	  interval <- binom.test(x, n)$conf.int
	  as.vector(interval * n)
	}
	s(50, 100)
	# binom.test() handles one experiment per call, so passing s() a vector of
	# counts is expected to fail here.
	s(50:55, 100) # BOO
	# Vectorize() wraps s so that it is called once per element of x.
	vs <- Vectorize(s, "x")
	vs(50:55, 100)
	# Name each input by its own value.
	x <- seq(10, 40, by = 10)
	names(x) <- x
	n <- seq(100, 400, by = 100)
	names(n) <- n
	# Vectorize over both arguments.
	vs2 <- Vectorize(s, c("x", "n"))
	vs2(x, 100)
	vs2(50, n)
	# x and n are walked in parallel, element by element:
	vs2(x, n) # s(x[1], n[1]), s(x[2], n[2]), s(x[3], n[3]), s(x[4], n[4])

	# Like this? explore mapply, ddply (plyr package), etc.

	## R integrates well
	# Install gdata only when it is missing, so re-running the script does not
	# reinstall the package every time.
	if (!requireNamespace('gdata', quietly = TRUE)) {
	  install.packages('gdata')
	}
	library(gdata)

	# Read the first sheet; cells containing ':' are read as NA and strings are
	# kept as character.
	xl <- read.xls('~/Downloads/eurostat_dirty.xlsx', sheet=1, na.strings=':', stringsAsFactors = FALSE)
	xl
	# Keep at most the first 39 rows. head() replaces xl[-(40:nrow(xl)), ]
	# because 40:nrow(xl) counts backwards when the sheet has fewer than 40 rows
	# and would then drop the wrong rows.
	xl <- head(xl, 39)
	# Use the first column as row names.
	rownames(xl) <- xl[, 1]
	# Keep every second column starting at column 2, then drop the first of those.
	xl <- xl[, seq(2, ncol(xl), by = 2)]
	xl <- xl[, -1]
	# Column names come from row 2, prefixed with "Y".
	colnames(xl) <- paste0("Y", xl[2, ])
	# Drop the first four rows.
	xl <- xl[-(1:4), ]

	# Column totals; an NA anywhere in a column makes that column's total NA.
	colSums(xl)
	# Same totals, skipping missing values (TRUE spelled out: T can be reassigned).
	colSums(xl, na.rm = TRUE)
	# Drop every row that still has a missing value.
	xl <- na.omit(xl)
	# Per-column maximum and mean (MARGIN = 2 means "over columns").
	apply(xl, MARGIN=2, max)
	apply(xl, MARGIN=2, mean)
	summary(xl)

	## Advanced data processing
	# Bin Y2011 into (0, 30], (30, 100] and (100, Inf] and label the bins.
	xl$Bed_Category <- cut(xl$Y2011, c(0, 30, 100, Inf), labels = c("Little", "Medium", "Lots"))
	?tapply
	# Mean of Y2011 within each bin.
	tapply(xl$Y2011, xl$Bed_Category, FUN = mean)

	# 1,1,1,2,2,2,3,3,3 repeated twice; rle() reports the runs of equal values.
	x <- rep(1:3, each = 3, times = 2)
	x
	rle(x)

	# Where to advertise? A multi-armed bandit approach.
	# Draw three row names with replacement, weighted by Y2011. The weights are
	# passed by name so the call does not rely on argument position.
	sample(rownames(xl), size = 3, replace = TRUE, prob = xl$Y2011)

	cut(xl$Y2001, 3) # Generate cut points automatically. Oh-oh
	# Break points computed from the Y2001 values themselves.
	quantile(xl$Y2001, probs = c(0, 0.25, 0.75, 1))
	# Bin Y2001 on its own quantiles (the ones printed just above), so every
	# value lies inside the range of the breaks. Taking the breaks from Y2011
	# would turn any Y2001 value outside the Y2011 range into NA.
	xl$Old_Bed_Category <- cut(xl$Y2001,
	    quantile(xl$Y2001, probs = c(0, 0.25, 0.75, 1)),
	    labels = c('Low', 'Medium', 'High'),
	    right = TRUE, include.lowest = TRUE) # Include both ends of the range

	xl

	# In R, the question is often "What's the function that does THAT?"
	# ftable() cross-tabulates the two category columns.
	ftable(xl[, c('Bed_Category', 'Old_Bed_Category')])

	## Riddle: How do I find the problematic 5?
	## ...
	## ...
	## ...
	# Answer: filter on both categories and read off the row names.
	rownames(xl[xl$Bed_Category == 'Little' & xl$Old_Bed_Category == 'Low', ])

	## More data plays
	# order() returns the row positions that would sort Y2011 ascending ...
	order(xl$Y2011) # Huh?
	# ... so indexing the rows with it yields the sorted table.
	xl[order(xl$Y2011), ] # ahhhh
	# rank() gives each value's position in the sorted order (the inverse
	# permutation of order() when there are no ties).
	rank(xl$Y2011) # Inversed perm!

	## Stats & Probability
	# Five draws from a uniform distribution between 0 and 3.
	runif(5, 0, 3)
	# One draw from a binomial distribution with size 100 and probability 0.5.
	rbinom(1, 100, 0.5)
	# Histograms of 10, 100 and 1000 such draws; the shape should look
	# increasingly bell-like as the number of draws grows.
	hist(rbinom(10, 100, 0.5))
	hist(rbinom(100, 100, 0.5))
	hist(rbinom(1000, 100, 0.5)) # CLT!
	# For comparison: 1000 draws from a normal distribution (mean 1.5, sd 1).
	hist(rnorm(1000, 1.5, 1))

	library(ggplot2)
	# Note: `l` holds this sample here; later sections reuse the name for model fits.
	l <- rnorm(1000, 1.5, 1)
	# qplot() builds a plot object; printing the object draws it.
	p <- qplot(l, geom = 'histogram')
	p
	# Labels and layers are added to a plot with +.
	p + xlab("Coffee breaks per day")

	## More cool IO
	library(XML)
	# NOTE(review): plain-http URL; confirm readHTMLTable() can still fetch it.
	theurl <- "http://en.wikipedia.org/wiki/List_of_tallest_structures_in_the_world"
	tables <- readHTMLTable(theurl)
	if (length(tables) == 0) {
	  stop("no HTML tables found at ", theurl)
	}
	# Row count per table. Entries without dimensions (readHTMLTable() can
	# return NULL for a table) count as 0 rows, so n.rows keeps one entry per
	# element of `tables`. unlist(lapply(...)) silently dropped those entries,
	# which shifted the position returned by which.max().
	n.rows <- vapply(tables, function(t) if (is.null(dim(t))) 0L else nrow(t), integer(1))
	# Work with the table that has the most rows.
	tbl <- tables[[which.max(n.rows)]]
	tbl
	tbl <- tbl[, 1:6]
	colnames(tbl)[c(2, 4, 5)] <- c('Height', 'Type', 'Use')
	# Height: keep the text before the first whitespace, then convert to numeric.
	tbl$Height <- as.numeric(gsub('\\s.*', '', tbl$Height))
	# Year: entries without any digit become NA; otherwise keep the leading
	# digits (everything from the first non-digit on is removed).
	tbl$Year[!grepl("\\d", tbl$Year)] <- NA
	tbl$Year <- as.numeric(sub("\\D.*", '', tbl$Year))

	qplot(data = tbl, x = Year, y = Height, color = Country)

	# Bar charts of the number of rows per Type, using complete rows only.
	qplot(data = na.omit(tbl), x = Type, geom="bar")
	qplot(data = na.omit(tbl), x = Type, geom="bar", fill = Country)

	## Some linear models
	# mtcars is one of R's built-in example datasets.
	mtcars

	# Number of cars per cylinder count, with side-by-side bars per gear count.
	ggplot(data=mtcars, aes(factor(cyl), fill=factor(gear))) + geom_bar(position="dodge")

	# Linear regression of mpg on hp; printing the fit shows the coefficients.
	lm(data = mtcars, mpg ~ hp)
	l <- lm(data = mtcars, mpg ~ hp)
	summary(l) # Look at summary(l)$r.squared
	qplot(data = mtcars, x = hp, y = mpg, geom="point")
	# Same scatter plot with a fitted straight line overlaid.
	qplot(data = mtcars, x = hp, y = mpg, geom="point") + geom_smooth(method = 'lm')

	# Fitted values for the data the model was built on, then predictions for new hp values.
	predict(l)
	predict(l, newdata = data.frame(hp = seq(50, 300, by = 25)))

	# Quadratic fit: I() makes ^ mean arithmetic power inside the formula.
	l <- lm(data = mtcars, mpg ~ hp + I(hp^2))
	summary(l)
	# Draw the fitted curve by predicting on a grid of hp values (50 to 350 in steps of 3).
	qplot(data = mtcars, hp, mpg, geom="point") + geom_line(data = data.frame(hp=seq(50,350, by=3), mpg=predict(l, data.frame(hp=seq(50, 350, by=3)))), aes(x=hp, y=mpg))

	# Degree-10 polynomial in hp, drawn the same way on a grid from 50 to 335.
	l <- lm(data = mtcars, mpg ~ hp + I(hp^2) + I(hp^3) + I(hp^4) + I(hp^5) + I(hp^6) + I(hp^7) + I(hp^8) + I(hp^9) + I(hp^10))
	qplot(data = mtcars, hp, mpg, geom="point") + geom_line(data = data.frame(hp=seq(50,335, by=3), mpg=predict(l, data.frame(hp=seq(50, 335, by=3)))), aes(x=hp, y=mpg))

	# Regression on several predictors: hp, wt and cyl.
	l <- lm(data = mtcars, mpg ~ hp + wt + cyl)
	summary(l)
	# Note: geom_smooth(method = 'lm') fits its own line of mpg against wt for
	# this plot; it does not draw the model stored in `l`.
	qplot(data = mtcars, x = wt, y = mpg, geom="point") + geom_smooth(method = 'lm')

	## ... Incidentally...
	# Join tbl's Country column against xl's row names; all = FALSE keeps only
	# the rows that match on both sides.
	merge(tbl, xl, by.x='Country', by.y = 'row.names', all = FALSE)
	# Pivots
	library(reshape)
	# Sum of Mandates per Category; na.rm = TRUE is handed on to sum().
	cast(df, Category ~ ., value='Mandates', fun.aggregate = sum, na.rm = TRUE)
	# Number of rows per Category / Large combination.
	cast(df, Category + Large ~ ., value='Mandates', fun.aggregate = length)