Skip to content

Instantly share code, notes, and snippets.

@n8thangreen
Last active December 15, 2016 09:48
Show Gist options
  • Save n8thangreen/3c1f746867390f2dd561951751988113 to your computer and use it in GitHub Desktop.
Save n8thangreen/3c1f746867390f2dd561951751988113 to your computer and use it in GitHub Desktop.
Imperial College London MSc Public Health practical
## ----setup, include=FALSE------------------------------------------------
# Set the knitr default so that every chunk echoes its source code in the
# rendered document (individual chunks can still override this option).
knitr::opts_chunk$set(echo = TRUE)
## ---- eval=T,echo=T------------------------------------------------------
# Comparison operators: each comparison below evaluates to TRUE or FALSE.
# A comparison typed on its own line is printed automatically at the console
# and in a knitr chunk.
#
# Four numeric values used in the examples.
# NOTE(review): `c` is also the name of base R's c() function. Calls such as
# c(1, 2) keep working because R ignores non-function objects when it looks
# up a function, but a different variable name would be less confusing.
a <- 1
b <- 2
c <- -1
d <- 2
# Check if two values are the same. Note the difference between the two
# operators: `==` tests for equality, whereas a single `=` assigns a value.
# Here 1 == 2 is FALSE
a == b
# Check if a value is greater than another (2 > -1 is TRUE)
# Note that you can store the result in a variable and print it later
result_1 <- d > c
print(result_1)
# Check if a value is less than or equal to another (2 <= 2 is TRUE)
b <= d
# Check if a value does not equal another (-1 != 2 is TRUE)
c != b
# This also works with other simple data types, including character strings
test_1 <- "apples"
test_2 <- "oranges"
test_1 != test_2
## ---- echo=T,eval=T------------------------------------------------------
# Build three parallel vectors describing the same nine fictional people
person_names <- c(
  "frodo", "samwise", "peregrin", "meriadoc", "aragorn",
  "gandalf", "legolas", "gimli", "boromir"
)
# `rep()` saves typing "hobbit" four times; `factor()` turns the character
# vector into a categorical variable
races <- factor(
  c(rep("hobbit", times = 4), "man", "maia", "elf", "dwarf", "man")
)
ages <- c(50, 38, 28, 36, 87, 5000, 500, 139, 40)

# Logical subsetting of a vector:
# comparing a vector with a single number gives one TRUE/FALSE per element,
# so `indices` is TRUE wherever the age is strictly greater than 50
indices <- ages > 50
print(indices)
# Indexing with a logical vector keeps only the positions that are TRUE,
# i.e. the names of everyone older than 50
print(person_names[indices])
# The comparison can also be written directly inside the brackets
print(ages[ages <= 50])
print(person_names[races == "hobbit"])
# `which()` turns the TRUE/FALSE vector into the positions of the elements
# that satisfy the condition (here age <= 50)
print(which(ages <= 50))

# Logical subsetting of a data frame:
# `stringsAsFactors = FALSE` stops data.frame() from converting character
# columns (here `name`) into factors
fellowship <- data.frame(
  name = person_names,
  race = races,
  age = ages,
  stringsAsFactors = FALSE
)
# The condition must be built from the column itself, `fellowship$race`.
# It yields one TRUE/FALSE per row, and only the TRUE rows are kept.
# Leaving the position after the comma empty means "keep all columns"
print(fellowship[fellowship$race == "hobbit", ])
# The next line runs, but it does NOT select the hobbits: "race" == "hobbit"
# compares two character strings, which is simply FALSE, so a data frame
# with zero rows is printed
print(fellowship["race" == "hobbit", ])
## ---- eval=T,echo=T------------------------------------------------------
# `&` is TRUE only when the conditions on both of its sides are TRUE
print(1 < 5 & 1 > 0)
# `&` works element by element, so it can combine two row conditions:
# keep the rows that are hobbits AND younger than 40
young_hobbits <- fellowship[
  with(fellowship, race == "hobbit" & age < 40),
]
print(young_hobbits)
## ----echo=T,eval=T-------------------------------------------------------
# Load the melanoma data; `header = TRUE` tells read.csv() that the first
# line of the file holds the column names
melanoma_data <- read.csv("data/melanoma.csv", header = TRUE)
print(head(melanoma_data))
# Handing plot() a data frame with more than two columns draws a scatter
# plot for every pair of columns
plot(melanoma_data)
# To plot only some columns, pick them out by name first. A single index
# inside [ ] selects columns of a data frame and keeps every row
plot(melanoma_data[c("year", "incidence")])
## ---- echo=T,eval=T------------------------------------------------------
# The same two-column plot, now with axis labels supplied through optional
# arguments. A call may run over several lines: R keeps reading until the
# opening parenthesis has been closed.
plot(
  melanoma_data[c("year", "incidence")],
  xlab = "Time in years",
  ylab = "Incidence"
)
## ---- echo=T,eval=T------------------------------------------------------
cd4_data <- read.csv("data/cd4.csv", header = TRUE)
# Side-by-side boxplots summarising the CD4 counts at the two time points
# (one box per selected column)
boxplot(
  cd4_data[c("baseline", "oneyear")],
  ylab = "CD4 cell counts (in 100's)"
)
# A histogram shows how the baseline values are distributed
hist(
  cd4_data[["baseline"]],
  main = "Histogram of CD4 Cell count at Baseline",
  xlab = "CD4 cell count (100's)",
  ylab = "Distribution of cell counts at baseline"
)
## ---- echo=T,eval=T------------------------------------------------------
# Scatter plot of melanoma incidence (y) against year (x), with a title and
# axis labels. with() lets the columns be referred to by name instead of
# writing `melanoma_data$` each time.
with(
  melanoma_data,
  plot(
    year, incidence,
    main = "Melanoma incidence over time",
    xlab = "Time in years",
    ylab = "Incidence"
  )
)
# `lowess()` computes a smoothed curve for the relationship between year and
# incidence, and `lines()` draws that curve on top of the existing plot; the
# `col` argument sets the line colour. Try calling lowess() on its own,
# without lines(), to see what it returns.
with(melanoma_data, lines(lowess(year, incidence), col = "red"))
## ---- echo=T,eval=T,error=TRUE,warning=TRUE------------------------------
# An example of a warning message. NaN stands for "not a number": the
# logarithm of a negative number has no real-valued result, so R returns NaN
# and warns about it. A warning does not stop the script.
log(-2)
# A few examples of error messages - note that the messages are somewhat
# informative. An error normally halts a script at the offending line, so
# each example is wrapped in try(): the message is still shown, but execution
# carries on and every example is reached when the file is run with source()
# or Rscript.
# exp() needs a number, not a character string
try(exp("candy"))
# rnorm() needs to be told how many values to draw
try(rnorm(NULL))
# `a` (assigned near the top of the script) is a plain vector, so it cannot
# be indexed by row and column
try(a[1, 1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment