Created
February 11, 2013 20:49
-
-
Save Nimster/4757490 to your computer and use it in GitHub Desktop.
An introduction to R, as presented at http://www.meetup.com/Big-Data-Israel/events/96536782/
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
######### Intro to R ###############
######### The Data Frame ###########
# One row per party (the row names), with its leader, political category and
# number of mandates. Category is built as an explicit factor: since R 4.0
# data.frame() no longer converts strings to factors, and the "Factors" demo
# at the end of this section needs one.
df <- data.frame(
  row.names = c('HaLikud', 'Yesh Atid', 'HaAvoda', 'HaBait HaYehudi', 'Yehadut HaTora', 'Meretz', 'Shas'),
  LeaderName = c('Netanyahu', 'Lapid', 'Yehimovitch', 'Bennet', 'Litzman', 'GalOn', 'Yishai'),
  Category = factor(c('Right', 'Center', 'Left', 'Right', 'Religious', 'Left', 'Religious')),
  Mandates = c(31, 19, 15, 12, 7, 6, 11)
)
df
colnames(df)
rownames(df)
# Row selection by logical mask; column selection by name, position, or
# negative position (drop that column).
df[df$Category == 'Right', ]
df[df$Category == 'Right', "Mandates"]
df[df$Category == 'Right', c("Mandates", "LeaderName")]
df[df$Category == 'Right', c(3, 1)]
df[df$Category == 'Right', -2]
# Compound conditions use the elementwise operators & and |.
df[(df$Category == 'Right' & df$Mandates > 20) | (df$Category == 'Left' & df$Mandates < 15), ]
# The mask itself is just a logical vector ...
(df$Category == 'Right' & df$Mandates > 20) | (df$Category == 'Left' & df$Mandates < 15)
# ... and which() turns it into the positions of the TRUE entries.
which((df$Category == 'Right' & df$Mandates > 20) | (df$Category == 'Left' & df$Mandates < 15))
df[which((df$Category == 'Right' & df$Mandates > 20) | (df$Category == 'Left' & df$Mandates < 15)), ]
# Subset-assignment: overwrite the matching cells, then blank them out as NA.
df[which((df$Category == 'Right' & df$Mandates > 20) | (df$Category == 'Left' & df$Mandates < 15)), "Mandates"] <- 999
df[df$Mandates > 500, "Mandates"] <- NA
df[!is.na(df$Mandates), ]
df[df$Category %in% c('Right', 'Left'), ]
# with() evaluates an expression with the columns of df in scope.
with(df, Mandates * 3)
# BOO: outside with() there is no object called Mandates, so this errors.
# try() reports the error but lets the script continue when it is sourced.
# So how did with() work?
try(Mandates)
# Rubyists: understand this as a ruby block (closure)
with(df, {
  print("HERE!")
  Mandates * 3
})
# But this is unique: transform() returns a copy of df with derived columns.
transform(df, remaining = 61 - Mandates, logvoters = log(22500 * Mandates))
## Factors
df$Category
## Matrices
# matrix() fills column by column; this one is 4 x 2.
matrix(c(1, 2, 3, 4, 5, 6, 7, 8), nrow = 4, ncol = 2)
# %*% is the matrix product: (4 x 2) %*% (2 x 3) yields a 4 x 3 matrix.
matrix(c(1, 2, 3, 4, 5, 6, 7, 8), nrow = 4, ncol = 2) %*%
  matrix(c(1, 2, 3, -1, -2, -3), nrow = 2, ncol = 3)
## Vectors
1:8
c(1:4, 10:15)
# each = repeats every element in place, times = repeats the whole result.
rep(seq(1, 2, by = 0.2), each = 3, times = 2)
## Time Series
# Monthly series starting in 1960; the argument is spelled out in full
# (frequency) instead of relying on partial matching of `freq`.
ts(seq(100, 300, by = 5), start = 1960, frequency = 12)
## Everything is vectorized
sqrt((1:10)^2)
# A comparison already yields a logical vector (NA where Mandates is NA),
# so no ifelse(cond, TRUE, FALSE) wrapper is needed.
df$Large <- df$Mandates > 10
df
# paste() is vectorized as well: one new row name per row.
rownames(df) <- paste(rownames(df), df$LeaderName, sep = ' BeRashut ')
df
## R is functional
# f(x, y) = (x + y) / (x - y), computed elementwise.
#   x, y: numeric vectors (recycled against each other by R's usual rules).
#   Returns a numeric vector; where x == y the division by zero follows
#   R arithmetic (Inf, -Inf or NaN) rather than raising an error.
f <- function(x, y) {
  (x + y) / (x - y)
}
f(3, 4)
# Vector arguments are paired up element by element.
f(2:3, 4:5)
# outer() applies f to every combination of the two vectors (3 x 3 result).
outer(1:3, 4:6, FUN = f)
?binom.test
binom.test(50, 100)
# The result is a list-like object; list its components and pick one.
ls(binom.test(50, 100))
binom.test(50, 100)$conf.int
binom.test(50, 100)$conf.int[1]
binom.test(50, 100)$conf.int * 100
# as.vector() strips the attributes (e.g. conf.level) from the interval.
as.vector(binom.test(50, 100)$conf.int * 100)
# s(x, n): confidence interval of binom.test(x, n), scaled by n so the bounds
# are expressed as counts out of n. Returns a plain numeric vector of length 2.
s <- function(x, n) {
  as.vector(binom.test(x, n)$conf.int * n)
}
s(50, 100)
# BOO: binom.test() is not vectorized over x, so this errors. try() reports
# the error but lets the script continue when it is sourced.
try(s(50:55, 100))
# Vectorize() wraps s so the named arguments are iterated over.
vs <- Vectorize(s, "x")
vs(50:55, 100)
x <- seq(10, 40, by = 10)
names(x) <- x
n <- seq(100, 400, by = 100)
names(n) <- n
vs2 <- Vectorize(s, c("x", "n"))
vs2(x, 100)
vs2(50, n)
vs2(x, n) # s(x[1], n[1]), s(x[2], n[2]), s(x[3], n[3]), s(x[4], n[4])
# Like this? explore mapply, ddply (plyr package), etc.
## R integrates well
# Install gdata only when it is missing instead of re-installing on every run.
if (!requireNamespace('gdata', quietly = TRUE)) {
  install.packages('gdata')
}
library(gdata)
# ':' marks missing values in this sheet; keep strings as plain character.
xl <- read.xls('~/Downloads/eurostat_dirty.xlsx', sheet = 1, na.strings = ':',
               stringsAsFactors = FALSE)
xl
# Keep the first 39 rows. head() is also correct when the sheet has fewer
# than 40 rows, where -(40:nrow(xl)) would count backwards and drop real rows.
xl <- head(xl, 39)
# First column becomes the row names.
rownames(xl) <- xl[, 1]
# Keep every second column starting from the 2nd, then drop the first of
# those. NOTE(review): presumably the skipped columns hold flags/footnotes
# and the dropped one is not a yearly value - confirm against the sheet.
xl <- xl[, seq(2, ncol(xl), by = 2)]
xl <- xl[, -1]
# Row 2 carries the labels used as column names, prefixed with "Y"
# (later code refers to Y2001 and Y2011).
colnames(xl) <- paste0("Y", xl[2, ])
# Drop the four leading header rows now that the names are set.
xl <- xl[-(1:4), ]
colSums(xl)
colSums(xl, na.rm = TRUE)
# Keep complete rows only.
xl <- na.omit(xl)
apply(xl, MARGIN = 2, max)
apply(xl, MARGIN = 2, mean)
summary(xl)
## Advanced data processing
# Bin Y2011 into three labelled intervals: (0, 30], (30, 100], (100, Inf].
xl$Bed_Category <- cut(xl$Y2011, c(0, 30, 100, Inf),
                       labels = c("Little", "Medium", "Lots"))
?tapply
# Mean of Y2011 within each bin.
tapply(xl$Y2011, xl$Bed_Category, FUN = mean)
# rep() already returns a vector, so no c() wrapper is needed.
x <- rep(1:3, each = 3, times = 2)
x
# Run-length encoding: the lengths and values of consecutive equal runs.
rle(x)
# Where to advertise? A multi-armed bandit approach.
# Draw 3 row names with replacement, weighted by Y2011.
sample(rownames(xl), 3, replace = TRUE, prob = xl$Y2011)
cut(xl$Y2001, 3) # Generate cut points automatically. Oh-oh
quantile(xl$Y2001, probs = c(0, 0.25, 0.75, 1))
# Bin Y2001 by its OWN quantiles. Using breaks taken from another column
# (Y2011) would turn every Y2001 value outside that column's range into NA.
xl$Old_Bed_Category <- cut(
  xl$Y2001,
  quantile(xl$Y2001, probs = c(0, 0.25, 0.75, 1)),
  labels = c('Low', 'Medium', 'High'),
  right = TRUE, include.lowest = TRUE # Include both ends of the range
)
xl
# in R, the question is often "What's the function that does *THAT*?"
ftable(xl[, c('Bed_Category', 'Old_Bed_Category')])
## Riddle: How do I find the problematic 5?
## ...
## ...
## ...
# which() drops NA comparisons, so no all-NA rows sneak into the answer.
rownames(xl[which(xl$Bed_Category == 'Little' & xl$Old_Bed_Category == 'Low'), ])
## More data plays
# order() returns the row positions that would sort the vector ...
order(xl$Y2011) # Huh?
# ... so indexing with it yields the sorted data frame.
xl[order(xl$Y2011), ] # ahhhh
# rank() gives each element's position in the sorted order: the inverse
# permutation of order() when there are no ties.
rank(xl$Y2011) # Inversed perm!
## Stats & Probability
# r<dist>() functions draw random samples: first argument is the sample size.
runif(5, 0, 3)
rbinom(1, 100, 0.5)
# Histograms of 10, 100 and 1000 draws from Binomial(100, 0.5).
hist(rbinom(10, 100, 0.5))
hist(rbinom(100, 100, 0.5))
hist(rbinom(1000, 100, 0.5)) # CLT!
# 1000 draws from a normal distribution with mean 1.5 and sd 1.
hist(rnorm(1000, 1.5, 1))
library(ggplot2)
l <- rnorm(1000, 1.5, 1)
# A ggplot2 plot is an object: store it, print it, and add layers with +.
p <- qplot(l, geom = 'histogram')
p
p + xlab("Coffee breaks per day")
## More cool IO
library(XML)
theurl <- "http://en.wikipedia.org/wiki/List_of_tallest_structures_in_the_world"
tables <- readHTMLTable(theurl)
# Row count per parsed table. NROW() returns 0 for NULL entries, so the
# result stays aligned with `tables`; unlist(lapply(...)) would drop NULL
# entries and make which.max() index the wrong element.
n.rows <- vapply(tables, NROW, integer(1))
tbl <- tables[[which.max(n.rows)]]
tbl
tbl <- tbl[, 1:6]
colnames(tbl)[c(2, 4, 5)] <- c('Height', 'Type', 'Use')
# Keep only the text before the first whitespace, then convert to numeric.
tbl$Height <- as.numeric(gsub('\\s.*', '', tbl$Height))
# Entries without any digit are not years; for the rest keep the leading
# digits only (everything from the first non-digit on is removed).
tbl$Year[!grepl("\\d", tbl$Year)] <- NA
tbl$Year <- as.numeric(sub("\\D.*", '', tbl$Year))
qplot(data = tbl, x = Year, y = Height, color = Country)
qplot(data = na.omit(tbl), x = Type, geom = "bar")
qplot(data = na.omit(tbl), x = Type, geom = "bar", fill = Country)
## Some linear models
mtcars
ggplot(data = mtcars, aes(factor(cyl), fill = factor(gear))) +
  geom_bar(position = "dodge")
# Simple linear regression of mpg on hp.
lm(data = mtcars, mpg ~ hp)
l <- lm(data = mtcars, mpg ~ hp)
summary(l) # Look at summary(l)$r.squared
qplot(data = mtcars, x = hp, y = mpg, geom = "point")
qplot(data = mtcars, x = hp, y = mpg, geom = "point") +
  geom_smooth(method = 'lm')
# Fitted values for the training data, then predictions on a new hp grid.
predict(l)
predict(l, newdata = data.frame(hp = seq(50, 300, by = 25)))
# Quadratic fit; I() protects ^ from its special meaning inside a formula.
l <- lm(data = mtcars, mpg ~ hp + I(hp^2))
summary(l)
# Overlay the fitted curve, evaluated on a fine hp grid, on the scatter plot.
qplot(data = mtcars, hp, mpg, geom = "point") +
  geom_line(
    data = data.frame(
      hp = seq(50, 350, by = 3),
      mpg = predict(l, data.frame(hp = seq(50, 350, by = 3)))
    ),
    aes(x = hp, y = mpg)
  )
# Degree-10 polynomial in hp (presumably to illustrate overfitting).
l <- lm(
  data = mtcars,
  mpg ~ hp + I(hp^2) + I(hp^3) + I(hp^4) + I(hp^5) +
    I(hp^6) + I(hp^7) + I(hp^8) + I(hp^9) + I(hp^10)
)
qplot(data = mtcars, hp, mpg, geom = "point") +
  geom_line(
    data = data.frame(
      hp = seq(50, 335, by = 3),
      mpg = predict(l, data.frame(hp = seq(50, 335, by = 3)))
    ),
    aes(x = hp, y = mpg)
  )
# Multiple regression on three predictors.
l <- lm(data = mtcars, mpg ~ hp + wt + cyl)
summary(l)
# Note: geom_smooth() here fits mpg ~ wt only, not the three-predictor model.
qplot(data = mtcars, x = wt, y = mpg, geom = "point") +
  geom_smooth(method = 'lm')
## ... Incidentally...
# Inner join (all = FALSE): tbl's Country column against xl's row names.
merge(tbl, xl, by.x = 'Country', by.y = 'row.names', all = FALSE)
# Pivots
library(reshape)
# Sum of Mandates per Category, ignoring NA values.
cast(df, Category ~ ., value = 'Mandates', fun.aggregate = sum, na.rm = TRUE)
# Number of rows per Category / Large combination.
cast(df, Category + Large ~ ., value = 'Mandates', fun.aggregate = length)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment