Created
May 30, 2012 19:45
-
-
Save Nimster/2838511 to your computer and use it in GitHub Desktop.
Quick reference to some common R idioms/capabilities - mainly to serve as a reminder for me
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Most of the summary is taken from the awesome R twotorials at http://www.twotorials.com/ by Anthony Damico
### Some of it is my own additions from experience. This is intended so you can Ctrl+F and find what you want using
### common names of functions and concepts from other languages or statistics.
### Troubleshooting: Search http://tolstoy.newcastle.edu.au/R/ , http://www.r-bloggers.com/, http://www.rseek.org/
### Basics
traceback() # Get the call stack after an error, for debugging
32 %% 2 # == 0 mod operator
5 %/% 3 # == 1 integer division
options(digits = 22) # Print up to 22 significant digits (the maximum). Affects display only, not stored precision
pi # 3.14...
Inf # Infinity
factorial(4) # 4!
z <- sqrt(16:20) # square root, variable assignment, ranges (sequences) - 16:20 becomes 16,17,18,19,20.
4:1 # == c(4,3,2,1).
z <- c(1, 2) + 5 # == c(6, 7): create a vector, add a number to both elements of the vector
z <- c(1, 2, 3, 4) + c(1, 2) # == c(2,4,4,6) : Automatically repeat (recycle) the shorter vector
df <- data.frame(col1 = c(1, 2, 3), col2 = 3:5) # Create a dataframe
nrow(df); ncol(df) # size of the data frame (number of rows and columns). dim(df) returns both at once
nchar("a string") # == 8, length of string (number of characters)
length(1:8) # == 8, length of a vector (number of elements)
rownames(df); colnames(df) # Also assignable (one name per row/column): colnames(df) <- c("a", "b")
df[df$col1 == 2, 1:2] # Rows where col1 == 2 (here the second row), and the first two columns.
df[, -4] # Everything except the 4th column. Returns a new data frame; df itself is unchanged
df[, 'unwanted'] <- NULL # Remove the column named 'unwanted' from df
z <- list(1:3, c("a", "b")) # Create a list with two objects, c(1,2,3) and c("a","b")
z[[2]] # Access the second member of the list z
4 %in% c(3, 4, 5) # TRUE
ls() # See all of the defined variables in the environment. You can also specify the environment
rm(y) # Delete the 'y' variable from the environment. Run gc() to garbage collect and free the memory
assign("something", val) # equivalent to something <- val. A kind of reflection
get("something") # retrieving the 'something' variable.
source("script.R") # Execute the script.R file
matrix(NA, nrow = 4, ncol = 4) # Create a 4x4 matrix. Matrix multiplication is %*%. Put the data where NA is
ts(x, start = 1960, frequency = 12) # Create an equispaced (here monthly) time-series vector from x. See as.ts as well
# Convert seconds since the epoch to a time. The epoch is 1970-01-01 00:00:00 *UTC*, so the origin must be
# built in UTC; any other tz shifts the result by that zone's offset.
# Equivalent: as.POSIXct(1241204120, origin = "1970-01-01", tz = "UTC")
ISOdatetime(1970, 1, 1, 0, 0, 0, tz = "UTC") + 1241204120
### Control structures and basic language stuff
if (!(TRUE & (FALSE | TRUE))) {
  # Not reached: the condition evaluates to FALSE.
} else {
  # We will get here. Note the single-character operators & and |: they work element-wise on vectors.
  # && and || are the scalar, short-circuiting forms usually preferred inside if().
  # T and F are shorthands for TRUE and FALSE, but they are ordinary variables that can be
  # reassigned, so prefer spelling out TRUE / FALSE.
}
for (i in 1:3) {
  # Happens 3 times
  next # Like continue, starts in the next iteration
}
while (i < 18) { break } # While loops. Break out of a loop. repeat { } is infinite loop
# "exception" handling (try and catch) and ignoring errors: | |
# Note: 12 / 0 is NOT an error in R (it evaluates to Inf), so raise a real error with stop() to demonstrate.
result <- try({ stop("something went wrong") }, silent = TRUE) # silent = TRUE: do not print the error message
inherits(result, "try-error") # TRUE: there was an exception. In general class(x) returns the type of an object
# For finer control (separate handlers for errors/warnings, cleanup code) see tryCatch().
# Define a function. `c` has a default value (15), so callers may omit it.
myfunc <- function(a, b, c = 15) {
  14 # The value of the last evaluated expression is the return value
}
### Basic Functions
seq(from = 0, to = 3, by = 0.5) # gives 0,0.5,1,1.5,2,2.5,3. Also length.out = 7 instead of by
rep(1:3, 2) # == c(1,2,3,1,2,3) repeat the vector from the beginning N times, here N=2.
as.numeric(x); as.logical(x); as.character(x) # Convert (cast) types. 0 is FALSE, any other number is TRUE.
is.na(c(1, NA, 3)) # == c(FALSE, TRUE, FALSE) whether a value is missing or not
ifelse(c(TRUE, FALSE), "true_case", "false_case") # == c("true_case", "false_case") (do an if-else on each member)
outer(1:2, 3:4, FUN = "*") # returns a matrix of applying FUN to each of the outer (cartesian) product elements.
# provide additional parameters to FUN after that parameter.
round(x, digits = 2) # also floor(), ceiling()
Sys.time() # Current time
### String functions
gsub("regex", "gerex", y) # replace every match (regular expression). Use sub to replace just the first match
grep("el", c("hello", "elbow", "world")) # == c(1,2) search for the pattern and return the matching indices
grepl("el", c("hello", "elbow", "world")) # == c(T,T,F) search for the pattern and return a logical vector
paste("a", c("b", "c"), sep="") # == c("ab", "ac") (concatenate strings). collapse="X" to make it all one string
install.packages("stringr"); library(stringr) # And then you get access to:
str_trim(" as ", side="left") # == "as ": remove leading/trailing whitespace (only the side requested).
# The two functions below are base R and do not need stringr:
strsplit(c("a-b", "c-d"), split="-") # Split each string; returns a list with one vector of pieces per input. split= is a regex
strptime(string, "%d-%m-%Y") # Takes a string and a format, returns a date-time object you can add seconds to.
### Input / Output
inData <- read.csv("inputData.csv", header = TRUE, stringsAsFactors = FALSE, na.strings = c("","999","—-","MISS")) # also quote = "" to disable quote handling
setwd("path/to/dir") # cd to another directory
# Read/Write Microsoft Excel spreadsheet files:
install.packages("gdata"); library(gdata)
read.xls("file.xlsx", sheet = 3) # Also sheet = "named sheet". Works for XLS or XLSX
# Alternatively
install.packages("xlsx"); library(xlsx)
write.xlsx(df, "file.xlsx", sheetName = "mysheet") # And keep writing to other sheets with append = TRUE
read.xlsx("file.xlsx", sheetIndex = 1) # or select the sheet by name with sheetName = "sheet name"
write.csv(df, "filename.csv") # Pass row.names = FALSE to leave out the row-name column
save(df1, df2, file = "filename.RData") # Save several things. Preferably use saveRDS(df, "name") for one item
load("filename.RData") # And load them
### Statistics and Probability
mean(x); median(x); range(x); sd(x); var(x) # average, median and c(min, max) (also use max(), min()),
# standard deviation and variance. All accept na.rm = TRUE
summary(x) # Min, max, mean, median and 25,75 quantiles
cor(x, y, method = "pearson") # Pearson product correlation coefficient between x and y. Or just do cor(df) for a whole dataframe
quantile(x, probs = seq(0, 1, by = 0.05)) # all percentiles 0%, 5%, 10%, ..., up to 100%
ftable(df[, c("col1", "col2", "col3")]) # crosstabs (number of appearances) of values of col1, col2, col3
# table also works, and you can useNA="always" to show NA values
unique(c(1,2,1,2,3,4)) # == c(1,2,3,4) : each distinct value once, in order of first appearance (duplicates dropped).
# To keep only the values that appear exactly once (c(3,4) here): x[!(duplicated(x) | duplicated(x, fromLast = TRUE))]
tapply(3:6, c(1,2,1,2), mean) # returns a table where 1 => 4, 2 => 5. In general,
# tapply(data, grouped_by, FUN, extra_params...) to stratify FUN on the data over the groups
aggregate(col1 ~ col2, df, quantile, probs=0.9) # find the 90th percentile of col1 stratified (grouped) by col2 in df
sample(vec, 5, prob = c(0.1, 0.3, 0.1, 0.5), replace = TRUE) # Choose 5 items from the vector at random according to the given
# probabilities (one weight per element of vec), with replacement. Without prob, sampling is uniform.
# The argument is named `prob`; `probs` is not an argument of sample() and fails with "unused argument".
set.seed(15) # set the seed for generating random numbers, so results are reproducible
runif(3, min = -1, max = 3) # 3 Uniformly distributed numbers on [-1, 3]. Also see rnorm(), rbinom(), etc.
l <- lm(x ~ y + z) # Create a linear regression model of x on y and z. Pass data = df to take the columns from a dataframe
residuals(l) # The residuals of the linear model
summary(l) # a summary of the model's goodness of fit. Also plot(l) to see a bunch of plots about the model
table(table(x)) # Frequency of frequencies: how many values appear exactly once? How many appear twice? etc.
### Playing with data
sort(x) # Returns the sorted values
order(x) # Returns the index permutation that sorts x, i.e. x[order(x)] == sort(x). For ranks use rank(x)
df[order(df$column3), ] # Sorts the data frame df by its "column3" column. order() is a function: call it with (), not []
merge(df1, df2, by = "col1") # (inner) join df1 and df2 on column col1 - no NAs are generated. See also
# by.x, by.y, all.x=TRUE, all.y=TRUE (or all=TRUE) to control what gets joined and added
cbind(df1, df2) # If df1 and df2 have the same *number* of rows, paste them left to right
rbind(df1, df2) # If df1 and df2 have the same columns, return df1 followed by df2
install.packages("reshape"); library(reshape)
rbind.fill(df1, df2) # Return df1 followed by (on top of) df2, and fill NAs for missing columns
# NOTE(review): rbind.fill is defined in the plyr package; if library(reshape) does not make it
# available, call plyr::rbind.fill(df1, df2) instead - confirm against the installed versions.
cut(1:6, c(1, 3, 5)) # == c(NA,(1,3],(1,3],(3,5],(3,5],NA). Specify a number instead of the breaks to cut into that
# many intervals. Specify labels=c("lab1","lab2",...) to name the intervals instead of the default.
rle(c("a","a","b","a","a")) # == c(2,1,2), c("a","b","a") : The run-length encoding of consecutive chunks (batches)
# of groups of values from the input vector
### Case study: Comparing two ranked data frames
# Reorder the rows of t1 so they follow the order of column X1 in t2.
# match() returns NA for keys of t2 that are missing from t1, which yields all-NA rows.
t1.matched <- t1[match(t2$X1, t1$X1), ]
# So the diffs are the row-by-row differences of the values
diffs <- t1.matched$value - t2$value
# The top differences (largest absolute difference first) are:
t2[order(abs(diffs), decreasing = TRUE), ]
# To compare side by side, use
cbind(t2, t1.matched$value, data.frame(diffs))[order(abs(diffs), decreasing = TRUE), ]
# To see where the top items from t1 went in the re-ordering, use
# (position in t2 of each t1 key, NA if absent)
match(t1$X1, t2$X1)
### Packages and Documentation
install.packages("packagename")
library(ggplot2) # Attach a package; stops with an error if it is not installed.
# require('ggplot2') instead returns FALSE (with a warning) when the package is missing, so a
# script would carry on and fail later - prefer library() for loading dependencies.
help.search("functionName") # look up a function
?seq # help page for the seq built-in. ??seq is shorthand for help.search("seq")
### Nifty stuff
install.packages("sqldf"); library(sqldf)
sqldf("select count(*), category from df where column1 > 12 group by category") # Run this SQL against df
history(max.show = 40) # show last 40 lines of readline history. Also savehistory("filename")
options() # List of all options. options(something = val) to change options like prompt, display limits etc.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment